From a4bef0ca826a8145ef3cb288846017c034a817c2 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Tue, 28 May 2024 12:15:50 -0700 Subject: [PATCH 001/230] [libc++] Mark P2845R8 `__cpp_lib_format_path` and P2587R3 `__cpp_lib_to_string` as C++26 (#93255) [P2845R8](https://wg21.link/P2845R8) "Formatting of `std::filesystem::path`" and [P2587R3](https://wg21.link/P2587R3) "`to_string` or not `to_string`" are C++26 features, so they should be marked accordingly in `generate_feature_test_macro_components.py`. I verified that without my changes, running the script produced no edits. Then with my changes, I ran the script to regenerate all files, with no other manual edits. Found while running libc++'s tests with MSVC's STL, which noticed this because it's currently a C++23-only implementation. Note that @H-G-Hristov has a draft implementation of P2587R3: #78100 --- libcxx/docs/FeatureTestMacroTable.rst | 8 ++-- libcxx/include/version | 4 +- .../filesystem.version.compile.pass.cpp | 23 +++------- .../string.version.compile.pass.cpp | 23 +++------- .../version.version.compile.pass.cpp | 46 ++++++------------- .../generate_feature_test_macro_components.py | 4 +- 6 files changed, 36 insertions(+), 72 deletions(-) diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst index 17d2da907692e8..0297068785e8b8 100644 --- a/libcxx/docs/FeatureTestMacroTable.rst +++ b/libcxx/docs/FeatureTestMacroTable.rst @@ -326,8 +326,6 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_expected`` ``202211L`` ---------------------------------------------------------- ----------------- - ``__cpp_lib_format_path`` *unimplemented* - ---------------------------------------------------------- ----------------- ``__cpp_lib_format_ranges`` ``202207L`` ---------------------------------------------------------- ----------------- ``__cpp_lib_formatters`` *unimplemented* @@ -386,8 +384,6 @@ Status 
---------------------------------------------------------- ----------------- ``__cpp_lib_string_resize_and_overwrite`` ``202110L`` ---------------------------------------------------------- ----------------- - ``__cpp_lib_to_string`` *unimplemented* - ---------------------------------------------------------- ----------------- ``__cpp_lib_to_underlying`` ``202102L`` ---------------------------------------------------------- ----------------- ``__cpp_lib_tuple_like`` *unimplemented* @@ -412,6 +408,8 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_default_template_type_for_algorithm_values`` *unimplemented* ---------------------------------------------------------- ----------------- + ``__cpp_lib_format_path`` *unimplemented* + ---------------------------------------------------------- ----------------- ``__cpp_lib_freestanding_algorithm`` *unimplemented* ---------------------------------------------------------- ----------------- ``__cpp_lib_freestanding_array`` *unimplemented* @@ -466,6 +464,8 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_to_chars`` *unimplemented* ---------------------------------------------------------- ----------------- + ``__cpp_lib_to_string`` *unimplemented* + ---------------------------------------------------------- ----------------- ``__cpp_lib_tuple_like`` *unimplemented* ========================================================== ================= diff --git a/libcxx/include/version b/libcxx/include/version index 69556d731f1cfc..140a9a0d870360 100644 --- a/libcxx/include/version +++ b/libcxx/include/version @@ -459,7 +459,6 @@ __cpp_lib_void_t 201411L # define __cpp_lib_constexpr_typeinfo 202106L # define __cpp_lib_containers_ranges 202202L # define __cpp_lib_expected 202211L -// # define __cpp_lib_format_path 202403L # define __cpp_lib_format_ranges 202207L // # define __cpp_lib_formatters 202302L # define __cpp_lib_forward_like 
202207L @@ -490,7 +489,6 @@ __cpp_lib_void_t 201411L # define __cpp_lib_stdatomic_h 202011L # define __cpp_lib_string_contains 202011L # define __cpp_lib_string_resize_and_overwrite 202110L -// # define __cpp_lib_to_string 202306L # define __cpp_lib_to_underlying 202102L // # define __cpp_lib_tuple_like 202207L # define __cpp_lib_unreachable 202202L @@ -506,6 +504,7 @@ __cpp_lib_void_t 201411L // # define __cpp_lib_copyable_function 202306L // # define __cpp_lib_debugging 202311L // # define __cpp_lib_default_template_type_for_algorithm_values 202403L +// # define __cpp_lib_format_path 202403L // # define __cpp_lib_freestanding_algorithm 202311L // # define __cpp_lib_freestanding_array 202311L // # define __cpp_lib_freestanding_cstring 202306L @@ -537,6 +536,7 @@ __cpp_lib_void_t 201411L // # define __cpp_lib_text_encoding 202306L # undef __cpp_lib_to_chars // # define __cpp_lib_to_chars 202306L +// # define __cpp_lib_to_string 202306L # undef __cpp_lib_tuple_like // # define __cpp_lib_tuple_like 202311L #endif diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp index 308cc2d43b0586..4aba33482f69c4 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp @@ -20,7 +20,7 @@ /* Constant Value __cpp_lib_char8_t 201907L [C++20] __cpp_lib_filesystem 201703L [C++17] - __cpp_lib_format_path 202403L [C++23] + __cpp_lib_format_path 202403L [C++26] */ #include @@ -37,7 +37,7 @@ # endif # ifdef __cpp_lib_format_path -# error "__cpp_lib_format_path should not be defined before c++23" +# error "__cpp_lib_format_path should not be defined before c++26" # endif #elif TEST_STD_VER == 14 @@ -51,7 +51,7 @@ # endif # ifdef 
__cpp_lib_format_path -# error "__cpp_lib_format_path should not be defined before c++23" +# error "__cpp_lib_format_path should not be defined before c++26" # endif #elif TEST_STD_VER == 17 @@ -74,7 +74,7 @@ # endif # ifdef __cpp_lib_format_path -# error "__cpp_lib_format_path should not be defined before c++23" +# error "__cpp_lib_format_path should not be defined before c++26" # endif #elif TEST_STD_VER == 20 @@ -106,7 +106,7 @@ # endif # ifdef __cpp_lib_format_path -# error "__cpp_lib_format_path should not be defined before c++23" +# error "__cpp_lib_format_path should not be defined before c++26" # endif #elif TEST_STD_VER == 23 @@ -137,17 +137,8 @@ # endif # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_format_path -# error "__cpp_lib_format_path should be defined in c++23" -# endif -# if __cpp_lib_format_path != 202403L -# error "__cpp_lib_format_path should have the value 202403L in c++23" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_format_path -# error "__cpp_lib_format_path should not be defined because it is unimplemented in libc++!" 
-# endif +# ifdef __cpp_lib_format_path +# error "__cpp_lib_format_path should not be defined before c++26" # endif #elif TEST_STD_VER > 23 diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp index 16a9a0a28de635..af6386a40a458a 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp @@ -29,7 +29,7 @@ __cpp_lib_string_udls 201304L [C++14] __cpp_lib_string_view 201606L [C++17] 201803L [C++20] - __cpp_lib_to_string 202306L [C++23] + __cpp_lib_to_string 202306L [C++26] */ #include @@ -86,7 +86,7 @@ # endif # ifdef __cpp_lib_to_string -# error "__cpp_lib_to_string should not be defined before c++23" +# error "__cpp_lib_to_string should not be defined before c++26" # endif #elif TEST_STD_VER == 14 @@ -143,7 +143,7 @@ # endif # ifdef __cpp_lib_to_string -# error "__cpp_lib_to_string should not be defined before c++23" +# error "__cpp_lib_to_string should not be defined before c++26" # endif #elif TEST_STD_VER == 17 @@ -209,7 +209,7 @@ # endif # ifdef __cpp_lib_to_string -# error "__cpp_lib_to_string should not be defined before c++23" +# error "__cpp_lib_to_string should not be defined before c++26" # endif #elif TEST_STD_VER == 20 @@ -293,7 +293,7 @@ # endif # ifdef __cpp_lib_to_string -# error "__cpp_lib_to_string should not be defined before c++23" +# error "__cpp_lib_to_string should not be defined before c++26" # endif #elif TEST_STD_VER == 23 @@ -385,17 +385,8 @@ # error "__cpp_lib_string_view should have the value 201803L in c++23" # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_to_string -# error "__cpp_lib_to_string should be defined in c++23" -# endif -# if __cpp_lib_to_string != 202306L -# error "__cpp_lib_to_string 
should have the value 202306L in c++23" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_to_string -# error "__cpp_lib_to_string should not be defined because it is unimplemented in libc++!" -# endif +# ifdef __cpp_lib_to_string +# error "__cpp_lib_to_string should not be defined before c++26" # endif #elif TEST_STD_VER > 23 diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp index 7829e06f90760b..c1e1f9f340af48 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp @@ -88,7 +88,7 @@ __cpp_lib_expected 202211L [C++23] __cpp_lib_filesystem 201703L [C++17] __cpp_lib_format 202106L [C++20] - __cpp_lib_format_path 202403L [C++23] + __cpp_lib_format_path 202403L [C++26] __cpp_lib_format_ranges 202207L [C++23] __cpp_lib_format_uchar 202311L [C++20] __cpp_lib_formatters 202302L [C++23] @@ -216,7 +216,7 @@ __cpp_lib_to_array 201907L [C++20] __cpp_lib_to_chars 201611L [C++17] 202306L [C++26] - __cpp_lib_to_string 202306L [C++23] + __cpp_lib_to_string 202306L [C++26] __cpp_lib_to_underlying 202102L [C++23] __cpp_lib_transformation_trait_aliases 201304L [C++14] __cpp_lib_transparent_operators 201210L [C++14] @@ -513,7 +513,7 @@ # endif # ifdef __cpp_lib_format_path -# error "__cpp_lib_format_path should not be defined before c++23" +# error "__cpp_lib_format_path should not be defined before c++26" # endif # ifdef __cpp_lib_format_ranges @@ -1005,7 +1005,7 @@ # endif # ifdef __cpp_lib_to_string -# error "__cpp_lib_to_string should not be defined before c++23" +# error "__cpp_lib_to_string should not be defined before c++26" # endif # ifdef __cpp_lib_to_underlying @@ -1348,7 +1348,7 @@ # endif # ifdef __cpp_lib_format_path -# error 
"__cpp_lib_format_path should not be defined before c++23" +# error "__cpp_lib_format_path should not be defined before c++26" # endif # ifdef __cpp_lib_format_ranges @@ -1891,7 +1891,7 @@ # endif # ifdef __cpp_lib_to_string -# error "__cpp_lib_to_string should not be defined before c++23" +# error "__cpp_lib_to_string should not be defined before c++26" # endif # ifdef __cpp_lib_to_underlying @@ -2303,7 +2303,7 @@ # endif # ifdef __cpp_lib_format_path -# error "__cpp_lib_format_path should not be defined before c++23" +# error "__cpp_lib_format_path should not be defined before c++26" # endif # ifdef __cpp_lib_format_ranges @@ -2972,7 +2972,7 @@ # endif # ifdef __cpp_lib_to_string -# error "__cpp_lib_to_string should not be defined before c++23" +# error "__cpp_lib_to_string should not be defined before c++26" # endif # ifdef __cpp_lib_to_underlying @@ -3543,7 +3543,7 @@ # endif # ifdef __cpp_lib_format_path -# error "__cpp_lib_format_path should not be defined before c++23" +# error "__cpp_lib_format_path should not be defined before c++26" # endif # ifdef __cpp_lib_format_ranges @@ -4350,7 +4350,7 @@ # endif # ifdef __cpp_lib_to_string -# error "__cpp_lib_to_string should not be defined before c++23" +# error "__cpp_lib_to_string should not be defined before c++26" # endif # ifdef __cpp_lib_to_underlying @@ -4971,17 +4971,8 @@ # endif # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_format_path -# error "__cpp_lib_format_path should be defined in c++23" -# endif -# if __cpp_lib_format_path != 202403L -# error "__cpp_lib_format_path should have the value 202403L in c++23" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_format_path -# error "__cpp_lib_format_path should not be defined because it is unimplemented in libc++!" 
-# endif +# ifdef __cpp_lib_format_path +# error "__cpp_lib_format_path should not be defined before c++26" # endif # ifndef __cpp_lib_format_ranges @@ -5943,17 +5934,8 @@ # endif # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_to_string -# error "__cpp_lib_to_string should be defined in c++23" -# endif -# if __cpp_lib_to_string != 202306L -# error "__cpp_lib_to_string should have the value 202306L in c++23" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_to_string -# error "__cpp_lib_to_string should not be defined because it is unimplemented in libc++!" -# endif +# ifdef __cpp_lib_to_string +# error "__cpp_lib_to_string should not be defined before c++26" # endif # ifndef __cpp_lib_to_underlying diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py index b04cb4f5115547..1e79f6c140758c 100755 --- a/libcxx/utils/generate_feature_test_macro_components.py +++ b/libcxx/utils/generate_feature_test_macro_components.py @@ -515,7 +515,7 @@ def add_version_header(tc): }, { "name": "__cpp_lib_format_path", - "values": {"c++23": 202403}, # P2845R8: Formatting of std::filesystem::path + "values": {"c++26": 202403}, # P2845R8: Formatting of std::filesystem::path "headers": ["filesystem"], "unimplemented": True, }, @@ -1270,7 +1270,7 @@ def add_version_header(tc): }, { "name": "__cpp_lib_to_string", - "values": {"c++23": 202306}, # P2587R3 to_string or not to_string + "values": {"c++26": 202306}, # P2587R3 to_string or not to_string "headers": ["string"], "unimplemented": True, }, From 51752ed0dd737f12014a89dec67d25494083153d Mon Sep 17 00:00:00 2001 From: Guray Ozen Date: Tue, 28 May 2024 21:17:31 +0200 Subject: [PATCH 002/230] [mlir][nvgpu] verify the module --- mlir/test/Examples/NVGPU/tools/nvdsl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/test/Examples/NVGPU/tools/nvdsl.py b/mlir/test/Examples/NVGPU/tools/nvdsl.py index 600cae5b47eeec..90dbb2355e1c87 
100644 --- a/mlir/test/Examples/NVGPU/tools/nvdsl.py +++ b/mlir/test/Examples/NVGPU/tools/nvdsl.py @@ -431,7 +431,7 @@ def __str__(self): # saveIR(module) # Verify the module - # module.operation.verify() + module.operation.verify() # Compile and JIT MLIR module options = f"cubin-chip=sm_90a cubin-features=+ptx80 opt-level=3" From 266fac8375bdf3f039503c559bb16ffab8895ae5 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Tue, 28 May 2024 12:17:57 -0700 Subject: [PATCH 003/230] [libc++] [test] Fix MSVC warnings (#93257) Found while running libc++'s tests with MSVC's STL. * Avoid MSVC warning C5101: use of preprocessor directive in function-like macro argument list is undefined behavior. + We can easily make this portable by extracting `const bool is_newlib`. + Followup to #73440. + See #73598. + See #73836. * Avoid MSVC warning C4267: 'return': conversion from 'size_t' to 'int', possible loss of data. + This warning is valid, but harmless for the test, so `static_cast` will avoid it. * Avoid MSVC warning C4146: unary minus operator applied to unsigned type, result still unsigned. + This warning is also valid (the scenario is sometimes intentional, but surprising enough that it's worth warning about). This is a C++17 test, so we can easily avoid it by testing `is_signed_v` at compile-time before testing `m < 0` and `n < 0` at run-time. * Silence MSVC warning C4310: cast truncates constant value. + These warnings are being emitted by `T(255)`. Disabling the warning is simpler than attempting to restructure the code. + Followup to #79791. * MSVC no longer emits warning C4521: multiple copy constructors specified. + This warning was removed from the compiler, since at least 2021-12-09. 
--- .../atomics.ref/compare_exchange_strong.pass.cpp | 3 +++ .../atomics.ref/compare_exchange_weak.pass.cpp | 3 +++ libcxx/test/std/atomics/atomics.ref/wait.pass.cpp | 3 +++ .../views.span/span.cons/initializer_list.pass.cpp | 4 ++-- .../syserr.errcat.objects/generic_category.pass.cpp | 11 +++++++---- .../syserr.errcat.objects/system_category.pass.cpp | 11 +++++++---- .../numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp | 10 ++++++---- libcxx/test/support/msvc_stdlib_force_include.h | 1 - 8 files changed, 31 insertions(+), 15 deletions(-) diff --git a/libcxx/test/std/atomics/atomics.ref/compare_exchange_strong.pass.cpp b/libcxx/test/std/atomics/atomics.ref/compare_exchange_strong.pass.cpp index 72b2f444c476c7..90aa5ea5b6df45 100644 --- a/libcxx/test/std/atomics/atomics.ref/compare_exchange_strong.pass.cpp +++ b/libcxx/test/std/atomics/atomics.ref/compare_exchange_strong.pass.cpp @@ -9,6 +9,9 @@ // XFAIL: !has-64-bit-atomics // XFAIL: !has-1024-bit-atomics +// MSVC warning C4310: cast truncates constant value +// ADDITIONAL_COMPILE_FLAGS(cl-style-warnings): /wd4310 + // bool compare_exchange_strong(T&, T, memory_order, memory_order) const noexcept; // bool compare_exchange_strong(T&, T, memory_order = memory_order::seq_cst) const noexcept; diff --git a/libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp b/libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp index 5219a8e3714f98..99c1385a2fe0b7 100644 --- a/libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp +++ b/libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp @@ -9,6 +9,9 @@ // XFAIL: !has-64-bit-atomics // XFAIL: !has-1024-bit-atomics +// MSVC warning C4310: cast truncates constant value +// ADDITIONAL_COMPILE_FLAGS(cl-style-warnings): /wd4310 + // bool compare_exchange_weak(T&, T, memory_order, memory_order) const noexcept; // bool compare_exchange_weak(T&, T, memory_order = memory_order::seq_cst) const noexcept; diff --git 
a/libcxx/test/std/atomics/atomics.ref/wait.pass.cpp b/libcxx/test/std/atomics/atomics.ref/wait.pass.cpp index e5310febf5c5eb..f246803ba25925 100644 --- a/libcxx/test/std/atomics/atomics.ref/wait.pass.cpp +++ b/libcxx/test/std/atomics/atomics.ref/wait.pass.cpp @@ -11,6 +11,9 @@ // XFAIL: !has-64-bit-atomics // XFAIL: !has-1024-bit-atomics +// MSVC warning C4310: cast truncates constant value +// ADDITIONAL_COMPILE_FLAGS(cl-style-warnings): /wd4310 + // void wait(T, memory_order = memory_order::seq_cst) const noexcept; #include diff --git a/libcxx/test/std/containers/views/views.span/span.cons/initializer_list.pass.cpp b/libcxx/test/std/containers/views/views.span/span.cons/initializer_list.pass.cpp index 74a5094f61261d..bc76e23fea3c03 100644 --- a/libcxx/test/std/containers/views/views.span/span.cons/initializer_list.pass.cpp +++ b/libcxx/test/std/containers/views/views.span/span.cons/initializer_list.pass.cpp @@ -93,9 +93,9 @@ constexpr bool test() { // Test P2447R4 "Annex C examples" -constexpr int three(std::span sp) { return sp.size(); } +constexpr int three(std::span sp) { return static_cast(sp.size()); } -constexpr int four(std::span sp) { return sp.size(); } +constexpr int four(std::span sp) { return static_cast(sp.size()); } bool test_P2447R4_annex_c_examples() { // 1. Overload resolution is affected diff --git a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp index d4bbde75ae8821..7283fdc769d86b 100644 --- a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp +++ b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp @@ -50,13 +50,16 @@ int main(int, char**) // responds with an empty message, which we probably want to // treat as a failure code otherwise, but we can detect that // with the preprocessor. 
+#if defined(_NEWLIB_VERSION) + const bool is_newlib = true; +#else + const bool is_newlib = false; +#endif + (void)is_newlib; LIBCPP_ASSERT(msg.rfind("Error -1 occurred", 0) == 0 // AIX || msg.rfind("No error information", 0) == 0 // Musl || msg.rfind("Unknown error", 0) == 0 // Glibc -#if defined(_NEWLIB_VERSION) - || msg.empty() -#endif - ); + || (is_newlib && msg.empty())); assert(errno == E2BIG); } diff --git a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp index eefbddd27a7f53..02a1baf5999831 100644 --- a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp +++ b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp @@ -56,13 +56,16 @@ int main(int, char**) { // responds with an empty message, which we probably want to // treat as a failure code otherwise, but we can detect that // with the preprocessor. 
+#if defined(_NEWLIB_VERSION) + const bool is_newlib = true; +#else + const bool is_newlib = false; +#endif + (void)is_newlib; LIBCPP_ASSERT(msg.rfind("Error -1 occurred", 0) == 0 // AIX || msg.rfind("No error information", 0) == 0 // Musl || msg.rfind("Unknown error", 0) == 0 // Glibc -#if defined(_NEWLIB_VERSION) - || msg.empty() -#endif - ); + || (is_newlib && msg.empty())); assert(errno == E2BIG); } diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp index 212804356a056d..bf40b174b209cc 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp @@ -57,10 +57,12 @@ T basic_gcd_(T m, T n) { template T basic_gcd(T m, T n) { using Tp = std::make_unsigned_t; - if (m < 0 && m != std::numeric_limits::min()) - m = -m; - if (n < 0 && n != std::numeric_limits::min()) - n = -n; + if constexpr (std::is_signed_v) { + if (m < 0 && m != std::numeric_limits::min()) + m = -m; + if (n < 0 && n != std::numeric_limits::min()) + n = -n; + } return basic_gcd_(static_cast(m), static_cast(n)); } diff --git a/libcxx/test/support/msvc_stdlib_force_include.h b/libcxx/test/support/msvc_stdlib_force_include.h index 6c26085e72c45f..35783c1607b0e0 100644 --- a/libcxx/test/support/msvc_stdlib_force_include.h +++ b/libcxx/test/support/msvc_stdlib_force_include.h @@ -67,7 +67,6 @@ const AssertionDialogAvoider assertion_dialog_avoider{}; // Silence compiler warnings. # pragma warning(disable : 4180) // qualifier applied to function type has no meaning; ignored # pragma warning(disable : 4324) // structure was padded due to alignment specifier -# pragma warning(disable : 4521) // multiple copy constructors specified # pragma warning(disable : 4702) // unreachable code # pragma warning(disable : 28251) // Inconsistent annotation for 'new': this instance has no annotations. 
#endif // !defined(__clang__) From 2ba08386156ef25913b1bee170d8fe95aaceb234 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Tue, 28 May 2024 12:20:58 -0700 Subject: [PATCH 004/230] [libc++] [test] Fix portability issues for MSVC (#93259) * Guard `std::__make_from_tuple_impl` tests with `#ifdef _LIBCPP_VERSION` and `LIBCPP_STATIC_ASSERT`. * Change `_LIBCPP_CONSTEXPR_SINCE_CXX20` to `TEST_CONSTEXPR_CXX20`. + Other functions in `variant.swap/swap.pass.cpp` were already using the proper test macro. * Mark `what` as `[[maybe_unused]]` when used by `TEST_LIBCPP_REQUIRE`. + This updates one occurrence in `libcxx/test/libcxx` for consistency. * Windows `_putenv_s()` takes 2 arguments, not 3. + See MSVC documentation: https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/putenv-s-wputenv-s?view=msvc-170 + POSIX `setenv()` takes `int overwrite`, but Windows `_putenv_s()` always overwrites. * Avoid non-Standard zero-length arrays. + Followup to #74183 and #79792. * Add `operator++()` to `unsized_it`. + The Standard requires this due to [N4981][] [move.iter.requirements]/1 "The template parameter `Iterator` shall either meet the *Cpp17InputIterator* requirements ([input.iterators]) or model `input_iterator` ([iterator.concept.input])." + MSVC's STL requires this because it has a strengthened exception specification in `move_iterator` that inspects the underlying iterator's increment operator. * `uniform_int_distribution` forbids `int8_t`/`uint8_t`. + See [N4981][] [rand.req.genl]/1.5. MSVC's STL enforces this. + Note that when changing the distribution's `IntType`, we need to be careful to preserve the original value range of `[0, max_input]`. * fstreams are constructible from `const fs::path::value_type*` on wide systems. + See [ifstream.cons], [ofstream.cons], [fstream.cons]. * In `msvc_stdlib_force_include.h`, map `_HAS_CXX23` to `TEST_STD_VER` 23 instead of 99. 
+ On 2023-05-23, https://github.com/llvm/llvm-project/commit/71400505ca048507e827013eb1ea0bc863525cab started recognizing 23 as a distinct value. * Fix test name typo: `destory_elements.pass.cpp` => `destroy_elements.pass.cpp` [N4981]: https://wg21.link/N4981 --- .../time.zone.db.tzdb/locate_zone.pass.cpp | 2 +- .../ranges.contains_subrange.pass.cpp | 25 +++++++++-------- ...nts.pass.cpp => destroy_elements.pass.cpp} | 0 .../fstreams/fstream.cons/path.pass.cpp | 2 +- .../fstreams/ifstream.cons/path.pass.cpp | 2 +- .../fstreams/ofstream.cons/path.pass.cpp | 2 +- .../sized_sentinel.compile.pass.cpp | 1 + .../numeric.ops/numeric.ops.gcd/gcd.pass.cpp | 9 ++++-- .../time.zone.db.access/current_zone.pass.cpp | 2 +- .../time.zone.db.access/locate_zone.pass.cpp | 2 +- .../time.zone.db.tzdb/current_zone.pass.cpp | 2 +- .../time.zone.db.tzdb/locate_zone.pass.cpp | 2 +- .../tuple.apply/make_from_tuple.pass.cpp | 28 ++++++++++--------- .../variant.swap/swap.pass.cpp | 2 +- .../test/support/msvc_stdlib_force_include.h | 2 +- 15 files changed, 45 insertions(+), 38 deletions(-) rename libcxx/test/std/containers/sequences/vector/vector.modifiers/{destory_elements.pass.cpp => destroy_elements.pass.cpp} (100%) diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp index 3ee213358f3524..08c682964c3745 100644 --- a/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp +++ b/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp @@ -73,7 +73,7 @@ L link link_to_link TEST_VALIDATE_EXCEPTION( std::runtime_error, [&]([[maybe_unused]] const std::runtime_error& e) { - std::string_view what{"tzdb: requested time zone not found"}; + [[maybe_unused]] std::string_view what{"tzdb: requested time zone not found"}; TEST_LIBCPP_REQUIRE( e.what() == what, TEST_WRITE_CONCATENATED("\nExpected exception ", 
what, "\nActual exception ", e.what(), '\n')); diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains_subrange.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains_subrange.pass.cpp index 761691c2afdcb9..890ac23fff8327 100644 --- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains_subrange.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains_subrange.pass.cpp @@ -24,6 +24,7 @@ // Proj1 proj1 = {}, Proj2 proj2 = {}); // since C++23 #include +#include #include #include #include @@ -130,10 +131,10 @@ constexpr void test_iterators() { } { // range has zero length - int a[] = {}; - int p[] = {3, 4, 2}; - auto whole = std::ranges::subrange(Iter1(a), Sent1(Iter1(a))); - auto subrange = std::ranges::subrange(Iter2(p), Sent2(Iter2(std::end(p)))); + std::array a = {}; + int p[] = {3, 4, 2}; + auto whole = std::ranges::subrange(Iter1(a.data()), Sent1(Iter1(a.data()))); + auto subrange = std::ranges::subrange(Iter2(p), Sent2(Iter2(std::end(p)))); { bool ret = std::ranges::contains_subrange(whole.begin(), whole.end(), subrange.begin(), subrange.end()); assert(!ret); @@ -145,10 +146,10 @@ constexpr void test_iterators() { } { // subrange has zero length - int a[] = {3, 4, 2}; - int p[] = {}; - auto whole = std::ranges::subrange(Iter1(a), Sent1(Iter1(std::end(a)))); - auto subrange = std::ranges::subrange(Iter2(p), Sent2(Iter2(p))); + int a[] = {3, 4, 2}; + std::array p = {}; + auto whole = std::ranges::subrange(Iter1(a), Sent1(Iter1(std::end(a)))); + auto subrange = std::ranges::subrange(Iter2(p.data()), Sent2(Iter2(p.data()))); { bool ret = std::ranges::contains_subrange(whole.begin(), whole.end(), subrange.begin(), subrange.end()); assert(ret); @@ -160,10 +161,10 @@ constexpr void test_iterators() { } { // range and subrange both have zero length - int a[] = {}; - int p[] = {}; - auto whole = std::ranges::subrange(Iter1(a), Sent1(Iter1(a))); - auto subrange = 
std::ranges::subrange(Iter2(p), Sent2(Iter2(p))); + std::array a = {}; + std::array p = {}; + auto whole = std::ranges::subrange(Iter1(a.data()), Sent1(Iter1(a.data()))); + auto subrange = std::ranges::subrange(Iter2(p.data()), Sent2(Iter2(p.data()))); { bool ret = std::ranges::contains_subrange(whole.begin(), whole.end(), subrange.begin(), subrange.end()); assert(ret); diff --git a/libcxx/test/std/containers/sequences/vector/vector.modifiers/destory_elements.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.modifiers/destroy_elements.pass.cpp similarity index 100% rename from libcxx/test/std/containers/sequences/vector/vector.modifiers/destory_elements.pass.cpp rename to libcxx/test/std/containers/sequences/vector/vector.modifiers/destroy_elements.pass.cpp diff --git a/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/path.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/path.pass.cpp index 5edf22eaacf31f..d6bb56d9b78b79 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/path.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/path.pass.cpp @@ -37,7 +37,7 @@ constexpr bool test_non_convert_to_path() { static_assert(!std::is_constructible_v>); // Char* pointers - if constexpr (!std::is_same_v) + if constexpr (!std::is_same_v && !std::is_same_v) static_assert(!std::is_constructible_v); // Iterators diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/path.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/path.pass.cpp index 2f27fd8e6e93d3..792b65615679a7 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/path.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/path.pass.cpp @@ -38,7 +38,7 @@ constexpr bool test_non_convert_to_path() { static_assert(!std::is_constructible_v>); // Char* pointers - if constexpr (!std::is_same_v) + if constexpr (!std::is_same_v 
&& !std::is_same_v) static_assert(!std::is_constructible_v); // Iterators diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/path.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/path.pass.cpp index e55adfd83fc3c7..602bdadd85813f 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/path.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/path.pass.cpp @@ -37,7 +37,7 @@ constexpr bool test_non_convert_to_path() { static_assert(!std::is_constructible_v>); // Char* pointers - if constexpr (!std::is_same_v) + if constexpr (!std::is_same_v && !std::is_same_v) static_assert(!std::is_constructible_v); // Iterators diff --git a/libcxx/test/std/iterators/predef.iterators/move.iterators/sized_sentinel.compile.pass.cpp b/libcxx/test/std/iterators/predef.iterators/move.iterators/sized_sentinel.compile.pass.cpp index cb49086dd6802b..998b13ed494552 100644 --- a/libcxx/test/std/iterators/predef.iterators/move.iterators/sized_sentinel.compile.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/move.iterators/sized_sentinel.compile.pass.cpp @@ -21,6 +21,7 @@ struct unsized_it { using difference_type = std::ptrdiff_t; value_type& operator*() const; + unsized_it& operator++(); bool operator==(const unsized_it&) const; difference_type operator-(const unsized_it&) const { return 0; } }; diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp index bf40b174b209cc..6a9ec1a2ffec24 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -69,12 +70,14 @@ T basic_gcd(T m, T n) { template void do_fuzzy_tests() { std::mt19937 gen(1938); - std::uniform_int_distribution distrib; + using DistIntType = std::conditional_t; // See 
N4981 [rand.req.genl]/1.5 + constexpr Input max_input = std::numeric_limits::max(); + std::uniform_int_distribution distrib(0, max_input); constexpr int nb_rounds = 10000; for (int i = 0; i < nb_rounds; ++i) { - Input n = distrib(gen); - Input m = distrib(gen); + Input n = static_cast(distrib(gen)); + Input m = static_cast(distrib(gen)); assert(std::gcd(n, m) == basic_gcd(n, m)); } } diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp index 2c43e121613c77..f31a679dd6214f 100644 --- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp @@ -32,7 +32,7 @@ static void set_tz(std::string zone) { // Unlike POSIX it does not mention the string of putenv becomes part // of the environment. - int status = _putenv_s("TZ", zone.c_str(), 1); + int status = _putenv_s("TZ", zone.c_str()); assert(status == 0); } diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp index 4d600fcdf40e3f..8dd895fd21814f 100644 --- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp @@ -40,7 +40,7 @@ static void test_exception([[maybe_unused]] std::string_view zone) { TEST_VALIDATE_EXCEPTION( std::runtime_error, [&]([[maybe_unused]] const std::runtime_error& e) { - std::string_view what{"tzdb: requested time zone not found"}; + [[maybe_unused]] std::string_view what{"tzdb: requested time zone not found"}; TEST_LIBCPP_REQUIRE( e.what() == what, TEST_WRITE_CONCATENATED("\nExpected exception ", what, "\nActual exception ", e.what(), '\n')); diff --git 
a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp index e6497e26323ce6..98509c298ebcb8 100644 --- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp @@ -34,7 +34,7 @@ static void set_tz(std::string zone) { // Unlike POSIX it does not mention the string of putenv becomes part // of the environment. - int status = _putenv_s("TZ", zone.c_str(), 1); + int status = _putenv_s("TZ", zone.c_str()); assert(status == 0); } diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp index f929dafcc96838..08ce48dfd0edb2 100644 --- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp @@ -42,7 +42,7 @@ static void test_exception([[maybe_unused]] std::string_view zone) { TEST_VALIDATE_EXCEPTION( std::runtime_error, [&]([[maybe_unused]] const std::runtime_error& e) { - std::string_view what{"tzdb: requested time zone not found"}; + [[maybe_unused]] std::string_view what{"tzdb: requested time zone not found"}; TEST_LIBCPP_REQUIRE( e.what() == what, TEST_WRITE_CONCATENATED("\nExpected exception ", what, "\nActual exception ", e.what(), '\n')); diff --git a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.pass.cpp b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.pass.cpp index d7374351afa8bf..accb601dd00365 100644 --- a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.pass.cpp +++ b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.pass.cpp @@ -209,6 +209,7 @@ template static constexpr bool can_make_from_tuple = 
std::is_same_v(T{}, Tuple{})), uint8_t>; +#ifdef _LIBCPP_VERSION template auto test_make_from_tuple_impl(T&&, Tuple&& t) -> decltype(std::__make_from_tuple_impl( @@ -224,6 +225,7 @@ uint32_t test_make_from_tuple_impl(...) { template static constexpr bool can_make_from_tuple_impl = std::is_same_v(T{}, Tuple{})), uint8_t>; +#endif // _LIBCPP_VERSION struct A { int a; @@ -263,23 +265,23 @@ static_assert(can_make_from_tuple>); // Test std::__make_from_tuple_impl constraints. // reinterpret_cast -static_assert(!can_make_from_tuple_impl>); -static_assert(can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(!can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl>); // const_cast -static_assert(!can_make_from_tuple_impl>); -static_assert(!can_make_from_tuple_impl>); -static_assert(can_make_from_tuple_impl>); -static_assert(can_make_from_tuple_impl>); -static_assert(can_make_from_tuple_impl>); -static_assert(can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(!can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(!can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl>); // static_cast -static_assert(!can_make_from_tuple_impl>); -static_assert(!can_make_from_tuple_impl>); -static_assert(can_make_from_tuple_impl>); -static_assert(can_make_from_tuple_impl>); -static_assert(can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(!can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(!can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl>); } // namespace LWG3528 diff --git a/libcxx/test/std/utilities/variant/variant.variant/variant.swap/swap.pass.cpp b/libcxx/test/std/utilities/variant/variant.variant/variant.swap/swap.pass.cpp index db05691c55818c..039a2373348c4e 100644 --- 
a/libcxx/test/std/utilities/variant/variant.variant/variant.swap/swap.pass.cpp +++ b/libcxx/test/std/utilities/variant/variant.variant/variant.swap/swap.pass.cpp @@ -516,7 +516,7 @@ constexpr void test_swap_sfinae() { } } -_LIBCPP_CONSTEXPR_SINCE_CXX20 void test_swap_noexcept() { +TEST_CONSTEXPR_CXX20 void test_swap_noexcept() { { using V = std::variant; static_assert(std::is_swappable_v && has_swap_member(), ""); diff --git a/libcxx/test/support/msvc_stdlib_force_include.h b/libcxx/test/support/msvc_stdlib_force_include.h index 35783c1607b0e0..785670224c3b18 100644 --- a/libcxx/test/support/msvc_stdlib_force_include.h +++ b/libcxx/test/support/msvc_stdlib_force_include.h @@ -90,7 +90,7 @@ const AssertionDialogAvoider assertion_dialog_avoider{}; #include #if _HAS_CXX23 -# define TEST_STD_VER 99 +# define TEST_STD_VER 23 #elif _HAS_CXX20 # define TEST_STD_VER 20 #elif _HAS_CXX17 From bc247ba113543b07fcff769ab616cf9509eb2794 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 28 May 2024 12:42:31 -0700 Subject: [PATCH 005/230] [memprof] Rename memprof-merge-v0.test to memprof-merge-versions.test (#93602) Despite the name, the test is used to test merge/show roundtrips for different MemProf versions. This patch renames the test to match the reality. --- .../{memprof-merge-v0.test => memprof-merge-versions.test} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename llvm/test/tools/llvm-profdata/{memprof-merge-v0.test => memprof-merge-versions.test} (100%) diff --git a/llvm/test/tools/llvm-profdata/memprof-merge-v0.test b/llvm/test/tools/llvm-profdata/memprof-merge-versions.test similarity index 100% rename from llvm/test/tools/llvm-profdata/memprof-merge-v0.test rename to llvm/test/tools/llvm-profdata/memprof-merge-versions.test From 1c3a3f0e79a9c6a7c1c4a71c43a9eab783c3b266 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 28 May 2024 12:49:42 -0700 Subject: [PATCH 006/230] [LegalizeTypes] Use VP_AND and VP_SHL/VP_SRA to promote operands of VP arithmetic.
(#92799) This adds VPSExtPromotedInteger and VPZExtPromotedInteger and uses them to promote many arithmetic operations. VPSExtPromotedInteger uses a shift pair because we don't have VP_SIGN_EXTEND_INREG yet. --- .../SelectionDAG/LegalizeIntegerTypes.cpp | 113 ++++++++++++------ llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 21 ++++ llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll | 12 +- llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll | 6 +- .../RISCV/rvv/fixed-vectors-vdiv-vp.ll | 9 +- .../RISCV/rvv/fixed-vectors-vdivu-vp.ll | 5 +- .../RISCV/rvv/fixed-vectors-vmax-vp.ll | 9 +- .../RISCV/rvv/fixed-vectors-vmaxu-vp.ll | 5 +- .../RISCV/rvv/fixed-vectors-vmin-vp.ll | 9 +- .../RISCV/rvv/fixed-vectors-vminu-vp.ll | 5 +- .../RISCV/rvv/fixed-vectors-vrem-vp.ll | 9 +- .../RISCV/rvv/fixed-vectors-vremu-vp.ll | 5 +- .../RISCV/rvv/fixed-vectors-vshl-vp.ll | 3 +- .../RISCV/rvv/fixed-vectors-vsra-vp.ll | 7 +- .../RISCV/rvv/fixed-vectors-vsrl-vp.ll | 5 +- llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll | 40 +++---- llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll | 10 +- llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll | 5 +- llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll | 10 +- llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll | 5 +- llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll | 10 +- llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll | 5 +- llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll | 10 +- llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll | 5 +- llvm/test/CodeGen/RISCV/rvv/vshl-vp.ll | 2 +- llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll | 7 +- llvm/test/CodeGen/RISCV/rvv/vsrl-vp.ll | 5 +- 27 files changed, 201 insertions(+), 136 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 8fda35f0086329..12f1d005249d60 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -646,18 +646,21 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) { } } - // Zero extend to the promoted type and do the count 
there. - SDValue Op = ZExtPromotedInteger(N->getOperand(0)); - // Subtract off the extra leading bits in the bigger type. SDValue ExtractLeadingBits = DAG.getConstant( NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits(), dl, NVT); - if (!N->isVPOpcode()) + if (!N->isVPOpcode()) { + // Zero extend to the promoted type and do the count there. + SDValue Op = ZExtPromotedInteger(N->getOperand(0)); return DAG.getNode(ISD::SUB, dl, NVT, DAG.getNode(N->getOpcode(), dl, NVT, Op), ExtractLeadingBits); + } + SDValue Mask = N->getOperand(1); SDValue EVL = N->getOperand(2); + // Zero extend to the promoted type and do the count there. + SDValue Op = VPZExtPromotedInteger(N->getOperand(0), Mask, EVL); return DAG.getNode(ISD::VP_SUB, dl, NVT, DAG.getNode(N->getOpcode(), dl, NVT, Op, Mask, EVL), ExtractLeadingBits, Mask, EVL); @@ -681,11 +684,16 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTPOP_PARITY(SDNode *N) { } // Zero extend to the promoted type and do the count or parity there. - SDValue Op = ZExtPromotedInteger(N->getOperand(0)); - if (!N->isVPOpcode()) + if (!N->isVPOpcode()) { + SDValue Op = ZExtPromotedInteger(N->getOperand(0)); return DAG.getNode(N->getOpcode(), SDLoc(N), Op.getValueType(), Op); - return DAG.getNode(N->getOpcode(), SDLoc(N), Op.getValueType(), Op, - N->getOperand(1), N->getOperand(2)); + } + + SDValue Mask = N->getOperand(1); + SDValue EVL = N->getOperand(2); + SDValue Op = VPZExtPromotedInteger(N->getOperand(0), Mask, EVL); + return DAG.getNode(N->getOpcode(), SDLoc(N), Op.getValueType(), Op, Mask, + EVL); } SDValue DAGTypeLegalizer::PromoteIntRes_CTTZ(SDNode *N) { @@ -1335,12 +1343,19 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FFREXP(SDNode *N) { SDValue DAGTypeLegalizer::PromoteIntRes_SHL(SDNode *N) { SDValue LHS = GetPromotedInteger(N->getOperand(0)); SDValue RHS = N->getOperand(1); - if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) - RHS = ZExtPromotedInteger(RHS); - if (N->getOpcode() != ISD::VP_SHL) + if 
(N->getOpcode() != ISD::VP_SHL) { + if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) + RHS = ZExtPromotedInteger(RHS); + return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); + } + + SDValue Mask = N->getOperand(2); + SDValue EVL = N->getOperand(3); + if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) + RHS = VPZExtPromotedInteger(RHS, Mask, EVL); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS, - N->getOperand(2), N->getOperand(3)); + Mask, EVL); } SDValue DAGTypeLegalizer::PromoteIntRes_SIGN_EXTEND_INREG(SDNode *N) { @@ -1364,27 +1379,39 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SimpleIntBinOp(SDNode *N) { } SDValue DAGTypeLegalizer::PromoteIntRes_SExtIntBinOp(SDNode *N) { - // Sign extend the input. - SDValue LHS = SExtPromotedInteger(N->getOperand(0)); - SDValue RHS = SExtPromotedInteger(N->getOperand(1)); - if (N->getNumOperands() == 2) + if (N->getNumOperands() == 2) { + // Sign extend the input. + SDValue LHS = SExtPromotedInteger(N->getOperand(0)); + SDValue RHS = SExtPromotedInteger(N->getOperand(1)); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); + } assert(N->getNumOperands() == 4 && "Unexpected number of operands!"); assert(N->isVPOpcode() && "Expected VP opcode"); + SDValue Mask = N->getOperand(2); + SDValue EVL = N->getOperand(3); + // Sign extend the input. + SDValue LHS = VPSExtPromotedInteger(N->getOperand(0), Mask, EVL); + SDValue RHS = VPSExtPromotedInteger(N->getOperand(1), Mask, EVL); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS, - N->getOperand(2), N->getOperand(3)); + Mask, EVL); } SDValue DAGTypeLegalizer::PromoteIntRes_ZExtIntBinOp(SDNode *N) { - // Zero extend the input. - SDValue LHS = ZExtPromotedInteger(N->getOperand(0)); - SDValue RHS = ZExtPromotedInteger(N->getOperand(1)); - if (N->getNumOperands() == 2) + if (N->getNumOperands() == 2) { + // Zero extend the input. 
+ SDValue LHS = ZExtPromotedInteger(N->getOperand(0)); + SDValue RHS = ZExtPromotedInteger(N->getOperand(1)); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); + } assert(N->getNumOperands() == 4 && "Unexpected number of operands!"); assert(N->isVPOpcode() && "Expected VP opcode"); + // Zero extend the input. + SDValue Mask = N->getOperand(2); + SDValue EVL = N->getOperand(3); + SDValue LHS = VPZExtPromotedInteger(N->getOperand(0), Mask, EVL); + SDValue RHS = VPZExtPromotedInteger(N->getOperand(1), Mask, EVL); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS, - N->getOperand(2), N->getOperand(3)); + Mask, EVL); } SDValue DAGTypeLegalizer::PromoteIntRes_UMINUMAX(SDNode *N) { @@ -1400,27 +1427,43 @@ SDValue DAGTypeLegalizer::PromoteIntRes_UMINUMAX(SDNode *N) { } SDValue DAGTypeLegalizer::PromoteIntRes_SRA(SDNode *N) { - // The input value must be properly sign extended. - SDValue LHS = SExtPromotedInteger(N->getOperand(0)); SDValue RHS = N->getOperand(1); - if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) - RHS = ZExtPromotedInteger(RHS); - if (N->getOpcode() != ISD::VP_SRA) + if (N->getOpcode() != ISD::VP_SRA) { + // The input value must be properly sign extended. + SDValue LHS = SExtPromotedInteger(N->getOperand(0)); + if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) + RHS = ZExtPromotedInteger(RHS); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); + } + + SDValue Mask = N->getOperand(2); + SDValue EVL = N->getOperand(3); + // The input value must be properly sign extended. 
+ SDValue LHS = VPSExtPromotedInteger(N->getOperand(0), Mask, EVL); + if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) + RHS = VPZExtPromotedInteger(RHS, Mask, EVL); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS, - N->getOperand(2), N->getOperand(3)); + Mask, EVL); } SDValue DAGTypeLegalizer::PromoteIntRes_SRL(SDNode *N) { - // The input value must be properly zero extended. - SDValue LHS = ZExtPromotedInteger(N->getOperand(0)); SDValue RHS = N->getOperand(1); - if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) - RHS = ZExtPromotedInteger(RHS); - if (N->getOpcode() != ISD::VP_SRL) + if (N->getOpcode() != ISD::VP_SRL) { + // The input value must be properly zero extended. + SDValue LHS = ZExtPromotedInteger(N->getOperand(0)); + if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) + RHS = ZExtPromotedInteger(RHS); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); + } + + SDValue Mask = N->getOperand(2); + SDValue EVL = N->getOperand(3); + // The input value must be properly zero extended. 
+ SDValue LHS = VPZExtPromotedInteger(N->getOperand(0), Mask, EVL); + if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) + RHS = VPZExtPromotedInteger(RHS, Mask, EVL); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS, - N->getOperand(2), N->getOperand(3)); + Mask, EVL); } SDValue DAGTypeLegalizer::PromoteIntRes_Rotate(SDNode *N) { @@ -1487,7 +1530,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VPFunnelShift(SDNode *N) { SDValue Mask = N->getOperand(3); SDValue EVL = N->getOperand(4); if (getTypeAction(Amt.getValueType()) == TargetLowering::TypePromoteInteger) - Amt = ZExtPromotedInteger(Amt); + Amt = VPZExtPromotedInteger(Amt, Mask, EVL); EVT AmtVT = Amt.getValueType(); SDLoc DL(N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index d925089d5689f1..ba3c7582d5a8a2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -275,6 +275,27 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { return DAG.getZeroExtendInReg(Op, dl, OldVT); } + /// Get a promoted operand and sign extend it to the final size. + SDValue VPSExtPromotedInteger(SDValue Op, SDValue Mask, SDValue EVL) { + EVT OldVT = Op.getValueType(); + SDLoc dl(Op); + Op = GetPromotedInteger(Op); + // FIXME: Add VP_SIGN_EXTEND_INREG. + EVT VT = Op.getValueType(); + unsigned BitsDiff = VT.getScalarSizeInBits() - OldVT.getScalarSizeInBits(); + SDValue ShiftCst = DAG.getShiftAmountConstant(BitsDiff, VT, dl); + SDValue Shl = DAG.getNode(ISD::VP_SHL, dl, VT, Op, ShiftCst, Mask, EVL); + return DAG.getNode(ISD::VP_SRA, dl, VT, Shl, ShiftCst, Mask, EVL); + } + + /// Get a promoted operand and zero extend it to the final size.
+ SDValue VPZExtPromotedInteger(SDValue Op, SDValue Mask, SDValue EVL) { + EVT OldVT = Op.getValueType(); + SDLoc dl(Op); + Op = GetPromotedInteger(Op); + return DAG.getVPZeroExtendInReg(Op, Mask, EVL, dl, OldVT); + } + // Promote the given operand V (vector or scalar) according to N's specific // reduction kind. N must be an integer VECREDUCE_* or VP_REDUCE_*. Returns // the nominal extension opcode (ISD::(ANY|ZERO|SIGN)_EXTEND) and the diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll index fff280c005b542..df413b878172bd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll @@ -2574,9 +2574,8 @@ define @vp_ctlz_nxv1i9( %va, @vp_ctlz_nxv1i9( %va, @vp_ctlz_zero_undef_nxv1i9( %va, @vp_ctlz_zero_undef_nxv1i9( %va, @vp_ctpop_nxv1i9( %va, @vp_ctpop_nxv1i9( %va, @llvm.vp.ctpop.nxv1i9( %va, %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdiv-vp.ll index 29f8eaba900527..e3c7d02462cc7f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdiv-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdiv-vp.ll @@ -9,12 +9,11 @@ declare <8 x i7> @llvm.vp.sdiv.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32) define <8 x i7> @vdiv_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vdiv_vv_v8i7: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vadd.vv v9, v9, v9 -; CHECK-NEXT: vsra.vi v9, v9, 1 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsra.vi v8, v8, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t ; CHECK-NEXT: vdiv.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.sdiv.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdivu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdivu-vp.ll index 3f8eb0ff276b7f..03bd85bf5e69e2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdivu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdivu-vp.ll @@ -10,10 +10,9 @@ define <8 x i7> @vdivu_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroe ; CHECK-LABEL: vdivu_vv_v8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 127 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vand.vx v9, v9, a1 -; CHECK-NEXT: vand.vx v8, v8, a1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a1, v0.t +; CHECK-NEXT: vand.vx v8, v8, a1, v0.t ; CHECK-NEXT: vdivu.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.udiv.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll index 9789afda9344ad..0b0d758ad8ded8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll @@ -9,12 +9,11 @@ declare <8 x i7> @llvm.vp.smax.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32) define <8 x i7> @vmax_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmax_vv_v8i7: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vadd.vv v9, v9, v9 -; CHECK-NEXT: vsra.vi v9, v9, 1 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsra.vi v8, v8, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t ; CHECK-NEXT: vmax.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.smax.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll index 36b0a4642b6169..98e630a0e59e5a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll @@ -10,10 +10,9 @@ define <8 x i7> @vmaxu_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroe ; CHECK-LABEL: vmaxu_vv_v8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 127 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vand.vx v9, v9, a1 -; CHECK-NEXT: vand.vx v8, v8, a1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a1, v0.t +; CHECK-NEXT: vand.vx v8, v8, a1, v0.t ; CHECK-NEXT: vmaxu.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.umax.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll index adb0a30f34d35a..a6e3764b37550d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll @@ -9,12 +9,11 @@ declare <8 x i7> @llvm.vp.smin.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32) define <8 x i7> @vmin_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmin_vv_v8i7: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vadd.vv v9, v9, v9 -; CHECK-NEXT: vsra.vi v9, v9, 1 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsra.vi v8, v8, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t ; CHECK-NEXT: vmin.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.smin.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll index 671ce82d4ae795..c59b65edd1ec10 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll @@ -10,10 +10,9 @@ define <8 x i7> @vminu_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroe ; CHECK-LABEL: vminu_vv_v8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 127 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vand.vx v9, v9, a1 -; CHECK-NEXT: vand.vx v8, v8, a1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a1, v0.t +; CHECK-NEXT: vand.vx v8, v8, a1, v0.t ; CHECK-NEXT: vminu.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.umin.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrem-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrem-vp.ll index 4bbbad5ed0e0e8..ff8a63e371c8ef 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrem-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrem-vp.ll @@ -9,12 +9,11 @@ declare <8 x i7> @llvm.vp.srem.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32) define <8 x i7> @vrem_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vrem_vv_v8i7: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vadd.vv v9, v9, v9 -; CHECK-NEXT: vsra.vi v9, v9, 1 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsra.vi v8, v8, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t ; CHECK-NEXT: vrem.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.srem.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vremu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vremu-vp.ll index ee11307bddc88c..b5eec4142c7824 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vremu-vp.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vremu-vp.ll @@ -10,10 +10,9 @@ define <8 x i7> @vremu_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroe ; CHECK-LABEL: vremu_vv_v8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 127 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vand.vx v9, v9, a1 -; CHECK-NEXT: vand.vx v8, v8, a1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a1, v0.t +; CHECK-NEXT: vand.vx v8, v8, a1, v0.t ; CHECK-NEXT: vremu.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.urem.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vshl-vp.ll index c4b7c1f2f19f0f..16a0fddfa98277 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vshl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vshl-vp.ll @@ -10,9 +10,8 @@ define <8 x i7> @vsll_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroex ; CHECK-LABEL: vsll_vv_v8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 127 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vand.vx v9, v9, a1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a1, v0.t ; CHECK-NEXT: vsll.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.shl.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsra-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsra-vp.ll index 7ea5b1f0b505a3..180fafa9659b1c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsra-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsra-vp.ll @@ -10,11 +10,10 @@ define <8 x i7> @vsra_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroex ; CHECK-LABEL: vsra_vv_v8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 127 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vand.vx v9, v9, a1 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsra.vi v8, v8, 1 ; 
CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a1, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t ; CHECK-NEXT: vsra.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.ashr.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsrl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsrl-vp.ll index 9f9d4af0cc2f3f..22f04803eadd74 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsrl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsrl-vp.ll @@ -10,10 +10,9 @@ define <8 x i7> @vsrl_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroex ; CHECK-LABEL: vsrl_vv_v8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 127 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vand.vx v9, v9, a1 -; CHECK-NEXT: vand.vx v8, v8, a1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a1, v0.t +; CHECK-NEXT: vand.vx v8, v8, a1, v0.t ; CHECK-NEXT: vsrl.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.lshr.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll index bc5617957d7d08..2c5a3dfffc2cfc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll @@ -1282,18 +1282,17 @@ define @fshr_v1i9( %a, %b, ; CHECK-LABEL: fshr_v1i9: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 511 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; CHECK-NEXT: vand.vx v10, v10, a1 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vsll.vi v9, v9, 7, v0.t +; CHECK-NEXT: vand.vx v10, v10, a1, v0.t ; CHECK-NEXT: li a0, 9 ; CHECK-NEXT: vremu.vx v10, v10, a0, v0.t ; CHECK-NEXT: vadd.vi v10, v10, 7, v0.t ; CHECK-NEXT: vand.vi v11, v10, 15, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 7, v0.t ; CHECK-NEXT: vsrl.vv v9, v9, v11, v0.t -; CHECK-NEXT: vsll.vi v8, v8, 1, 
v0.t ; CHECK-NEXT: vnot.v v10, v10, v0.t ; CHECK-NEXT: vand.vi v10, v10, 15, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t ; CHECK-NEXT: vsll.vv v8, v8, v10, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret @@ -1306,18 +1305,17 @@ define @fshl_v1i9( %a, %b, ; CHECK-LABEL: fshl_v1i9: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 511 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; CHECK-NEXT: vand.vx v10, v10, a1 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vsll.vi v9, v9, 7, v0.t -; CHECK-NEXT: vsrl.vi v9, v9, 1, v0.t +; CHECK-NEXT: vand.vx v10, v10, a1, v0.t ; CHECK-NEXT: li a0, 9 ; CHECK-NEXT: vremu.vx v10, v10, a0, v0.t -; CHECK-NEXT: vnot.v v11, v10, v0.t -; CHECK-NEXT: vand.vi v11, v11, 15, v0.t -; CHECK-NEXT: vsrl.vv v9, v9, v11, v0.t +; CHECK-NEXT: vand.vi v11, v10, 15, v0.t +; CHECK-NEXT: vsll.vv v8, v8, v11, v0.t +; CHECK-NEXT: vnot.v v10, v10, v0.t ; CHECK-NEXT: vand.vi v10, v10, 15, v0.t -; CHECK-NEXT: vsll.vv v8, v8, v10, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 7, v0.t +; CHECK-NEXT: vsrl.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsrl.vv v9, v9, v10, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %res = call @llvm.vp.fshl.nxv1i9( %a, %b, %c, %m, i32 %evl) @@ -1330,15 +1328,14 @@ declare @llvm.vp.fshr.nxv1i4(, @fshr_v1i4( %a, %b, %c, %m, i32 zeroext %evl) { ; CHECK-LABEL: fshr_v1i4: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; CHECK-NEXT: vand.vi v10, v10, 15 -; CHECK-NEXT: li a1, 4 ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vremu.vx v10, v10, a1, v0.t +; CHECK-NEXT: vand.vi v10, v10, 15, v0.t ; CHECK-NEXT: vand.vi v9, v9, 15, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vv v8, v8, v10, v0.t +; CHECK-NEXT: li a0, 4 +; CHECK-NEXT: vremu.vx v9, v10, a0, v0.t +; CHECK-NEXT: vsrl.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vi v8, v8, 15, v0.t ; CHECK-NEXT: ret %trunca = call @llvm.vp.trunc.nxv1i4.nxv1i8( %a, %m, i32 zeroext %evl) @@ 
-1353,15 +1350,14 @@ declare @llvm.vp.fshl.nxv1i4(, @fshl_v1i4( %a, %b, %c, %m, i32 zeroext %evl) { ; CHECK-LABEL: fshl_v1i4: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; CHECK-NEXT: vand.vi v10, v10, 15 -; CHECK-NEXT: li a1, 4 ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vremu.vx v10, v10, a1, v0.t +; CHECK-NEXT: vand.vi v10, v10, 15, v0.t ; CHECK-NEXT: vand.vi v9, v9, 15, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsll.vv v8, v8, v10, v0.t +; CHECK-NEXT: li a0, 4 +; CHECK-NEXT: vremu.vx v9, v10, a0, v0.t +; CHECK-NEXT: vsll.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t ; CHECK-NEXT: vand.vi v8, v8, 15, v0.t ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll index 26089706cf99ef..a4b7ca7f39768f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll @@ -9,11 +9,15 @@ declare @llvm.vp.sdiv.nxv8i7(, @vdiv_vx_nxv8i7( %a, i7 signext %b, %mask, i32 zeroext %evl) { ; CHECK-LABEL: vdiv_vx_nxv8i7: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t ; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsra.vi v8, v8, 1 +; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vdiv.vx v8, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t +; CHECK-NEXT: vdiv.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i7 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll index f41b885a66eaae..67c3f9dbf2869a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll @@ -10,11 +10,12 @@ define 
@vdivu_vx_nxv8i7( %a, i7 signext %b, < ; CHECK-LABEL: vdivu_vx_nxv8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 127 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v8, v8, a2, v0.t ; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a2 ; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vand.vx v9, v9, a2 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a2, v0.t ; CHECK-NEXT: vdivu.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i7 %b, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll index 8a76467986620c..c15caa31bb0986 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll @@ -9,11 +9,15 @@ declare @llvm.vp.smax.nxv8i7(, @vmax_vx_nxv8i7( %a, i7 signext %b, %mask, i32 zeroext %evl) { ; CHECK-LABEL: vmax_vx_nxv8i7: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t ; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsra.vi v8, v8, 1 +; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmax.vx v8, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t +; CHECK-NEXT: vmax.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i7 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll index 1c74887c1b20fb..df494f8af7387c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll @@ -10,11 +10,12 @@ define @vmaxu_vx_nxv8i7( %a, i7 signext %b, < ; CHECK-LABEL: vmaxu_vx_nxv8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 127 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v8, v8, a2, v0.t ; CHECK-NEXT: 
vsetvli a3, zero, e8, m1, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a2 ; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vand.vx v9, v9, a2 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a2, v0.t ; CHECK-NEXT: vmaxu.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i7 %b, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll index 1c71242c3c7d79..794a21c7c6abac 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll @@ -9,11 +9,15 @@ declare @llvm.vp.smin.nxv8i7(, @vmin_vx_nxv8i7( %a, i7 signext %b, %mask, i32 zeroext %evl) { ; CHECK-LABEL: vmin_vx_nxv8i7: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t ; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsra.vi v8, v8, 1 +; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmin.vx v8, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t +; CHECK-NEXT: vmin.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i7 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll index 6d89a9777cf917..d54de281a7fd28 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll @@ -10,11 +10,12 @@ define @vminu_vx_nxv8i7( %a, i7 signext %b, < ; CHECK-LABEL: vminu_vx_nxv8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 127 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v8, v8, a2, v0.t ; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a2 ; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vand.vx v9, v9, a2 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a2, v0.t ; 
CHECK-NEXT: vminu.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i7 %b, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll index cf85fd827b51f1..2ef96f4b3896fc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll @@ -9,11 +9,15 @@ declare @llvm.vp.srem.nxv8i7(, @vrem_vx_nxv8i7( %a, i7 signext %b, %mask, i32 zeroext %evl) { ; CHECK-LABEL: vrem_vx_nxv8i7: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t ; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsra.vi v8, v8, 1 +; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vrem.vx v8, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t +; CHECK-NEXT: vrem.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i7 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll index 61bdd5b8d3c8a7..1f1ed4a1269acb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll @@ -10,11 +10,12 @@ define @vremu_vx_nxv8i7( %a, i7 signext %b, < ; CHECK-LABEL: vremu_vx_nxv8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 127 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v8, v8, a2, v0.t ; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a2 ; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vand.vx v9, v9, a2 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a2, v0.t ; CHECK-NEXT: vremu.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i7 %b, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vshl-vp.ll index 
c04d5ea2da3c1b..380835494ed17d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vshl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vshl-vp.ll @@ -12,8 +12,8 @@ define @vsll_vx_nxv8i7( %a, i7 signext %b, poison, i7 %b, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll index 632c4db5c5bb57..cff8cc710d21f3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll @@ -9,13 +9,14 @@ declare @llvm.vp.ashr.nxv8i7(, @vsra_vx_nxv8i7( %a, i7 signext %b, %mask, i32 zeroext %evl) { ; CHECK-LABEL: vsra_vx_nxv8i7: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t ; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsra.vi v8, v8, 1 ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: li a0, 127 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a0, v0.t ; CHECK-NEXT: vsra.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i7 %b, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vsrl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsrl-vp.ll index ec5b7f3faf7ca8..ff6771b643031f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsrl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsrl-vp.ll @@ -10,11 +10,12 @@ define @vsrl_vx_nxv8i7( %a, i7 signext %b, poison, i7 %b, i32 0 From 0e96eebc7f681a7ce41f35909e609c7c61a11455 Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Tue, 28 May 2024 12:52:45 -0700 Subject: [PATCH 007/230] [msan] Reland: Increase k num stack origin descrs (limited to non-PowerPC) (#93117) The original pull request (https://github.com/llvm/llvm-project/pull/92838) was reverted due to a PowerPC buildbot breakage (https://github.com/llvm/llvm-project/commit/df626dd11c360c58eddae813ce6a0524d0a53696). This reland limits the scope of the change to non-PowerPC platforms. 
I am unaware of any PowerPC use cases that would benefit from a larger kNumStackOriginDescrs constant. Original CL description: This increases the constant size of kNumStackOriginDescrs to 4M (64MB of BSS across two arrays), which ought to be enough for anybody. This is the easier alternative suggested by eugenis@ in https://github.com/llvm/llvm-project/pull/92826. --- compiler-rt/lib/msan/msan.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/compiler-rt/lib/msan/msan.cpp b/compiler-rt/lib/msan/msan.cpp index a2fc27de1901b4..9375e27d4f4d24 100644 --- a/compiler-rt/lib/msan/msan.cpp +++ b/compiler-rt/lib/msan/msan.cpp @@ -100,7 +100,17 @@ int msan_report_count = 0; // Array of stack origins. // FIXME: make it resizable. -static const uptr kNumStackOriginDescrs = 1024 * 1024; +// Although BSS memory doesn't cost anything until used, it is limited to 2GB +// in some configurations (e.g., "relocation R_X86_64_PC32 out of range: +// ... is not in [-2147483648, 2147483647]; references section '.bss'"). +// We use kNumStackOriginDescrs * (sizeof(char*) + sizeof(uptr)) == 64MB. +#ifdef SANITIZER_PPC +// soft_rss_limit test (release_origin.c) fails on PPC if kNumStackOriginDescrs +// is too high +static const uptr kNumStackOriginDescrs = 1 * 1024 * 1024; +#else +static const uptr kNumStackOriginDescrs = 4 * 1024 * 1024; +#endif // SANITIZER_PPC static const char *StackOriginDescr[kNumStackOriginDescrs]; static uptr StackOriginPC[kNumStackOriginDescrs]; static atomic_uint32_t NumStackOriginDescrs; From d9dec109375ded13d61da20877c399fb8fbb877d Mon Sep 17 00:00:00 2001 From: Lucile Rose Nihlen Date: Tue, 28 May 2024 19:53:21 +0000 Subject: [PATCH 008/230] [ci] limit parallel windows compile jobs to 24 (#93329) This is an experiment to see if we can prevent some of the compiler OOMs happening without unduly impacting the Windows build latency.
--- .ci/monolithic-windows.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh index 4fd88ea81c84a8..91e719c52d4363 100755 --- a/.ci/monolithic-windows.sh +++ b/.ci/monolithic-windows.sh @@ -44,6 +44,8 @@ pip install -q -r "${MONOREPO_ROOT}"/mlir/python/requirements.txt # see https://github.com/llvm/llvm-project/pull/82393 and # https://discourse.llvm.org/t/rfc-future-of-windows-pre-commit-ci/76840/40 # for further information. +# We limit the number of parallel compile jobs to 16 to control memory +# consumption and improve build reliability. cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \ -D LLVM_ENABLE_PROJECTS="${projects}" \ -G Ninja \ @@ -58,7 +60,9 @@ cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \ -D MLIR_ENABLE_BINDINGS_PYTHON=ON \ -D CMAKE_EXE_LINKER_FLAGS="/MANIFEST:NO" \ -D CMAKE_MODULE_LINKER_FLAGS="/MANIFEST:NO" \ - -D CMAKE_SHARED_LINKER_FLAGS="/MANIFEST:NO" + -D CMAKE_SHARED_LINKER_FLAGS="/MANIFEST:NO" \ + -D LLVM_PARALLEL_COMPILE_JOBS=16 \ + -D LLVM_PARALLEL_LINK_JOBS=4 echo "--- ninja" # Targets are not escaped as they are passed as separate arguments. 
From c96860aea2c77392bad16f1c4f55014164669de3 Mon Sep 17 00:00:00 2001 From: Piotr Zegar Date: Tue, 28 May 2024 22:09:34 +0200 Subject: [PATCH 009/230] [clang-tidy] Optimize realpath in readability-identifier-naming (#92659) - Reduce disk IO usage by adding a cache to the realpath call introduced by #81985 --- .../clang-tidy/readability/IdentifierNamingCheck.cpp | 12 ++++++++++-- .../clang-tidy/readability/IdentifierNamingCheck.h | 2 ++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp index c3208392df1566..828f13805a6980 100644 --- a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp @@ -1414,13 +1414,21 @@ IdentifierNamingCheck::getDiagInfo(const NamingCheckId &ID, }}; } +StringRef IdentifierNamingCheck::getRealFileName(StringRef FileName) const { + auto Iter = RealFileNameCache.try_emplace(FileName); + SmallString<256U> &RealFileName = Iter.first->getValue(); + if (!Iter.second) + return RealFileName; + llvm::sys::fs::real_path(FileName, RealFileName); + return RealFileName; +} + const IdentifierNamingCheck::FileStyle & IdentifierNamingCheck::getStyleForFile(StringRef FileName) const { if (!GetConfigPerFile) return *MainFileStyle; - SmallString<128> RealFileName; - llvm::sys::fs::real_path(FileName, RealFileName); + StringRef RealFileName = getRealFileName(FileName); StringRef Parent = llvm::sys::path::parent_path(RealFileName); auto Iter = NamingStylesCache.find(Parent); if (Iter != NamingStylesCache.end()) diff --git a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h index 27c8e4bc768c40..646ec0eac8dd1c 100644 --- a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h +++ b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h @@ 
-205,6 +205,7 @@ class IdentifierNamingCheck final : public RenamerClangTidyCheck { const NamingCheckFailure &Failure) const override; const FileStyle &getStyleForFile(StringRef FileName) const; + StringRef getRealFileName(StringRef FileName) const; /// Find the style kind of a field in an anonymous record. StyleKind findStyleKindForAnonField( @@ -222,6 +223,7 @@ class IdentifierNamingCheck final : public RenamerClangTidyCheck { /// Stores the style options as a vector, indexed by the specified \ref /// StyleKind, for a given directory. mutable llvm::StringMap NamingStylesCache; + mutable llvm::StringMap<SmallString<256U>> RealFileNameCache; FileStyle *MainFileStyle; ClangTidyContext *Context; const bool GetConfigPerFile; From 0aacef3abc41cfc8efb5f1b9483bc37599352a59 Mon Sep 17 00:00:00 2001 From: Mattan Elkaim <73639004+mattanelkaim@users.noreply.github.com> Date: Tue, 28 May 2024 23:19:01 +0300 Subject: [PATCH 010/230] [clang-tidy][NFC] Update identifier-length.rst (#93467) Swapped code blocks of parameter and variable, which have been confused (in a clang-tidy doc file) --- .../checks/readability/identifier-length.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/identifier-length.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/identifier-length.rst index 44d97f7b363bff..271970c292c8fa 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/readability/identifier-length.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/identifier-length.rst @@ -28,10 +28,7 @@ The following options are described below: .. code-block:: c++ - int doubler(int x) // warns that x is too short - { - return 2 * x; - } + int i = 42; // warns that 'i' is too short This check does not have any fix suggestions in the general case since variable names have semantic value. @@ -50,7 +47,10 @@ The following options are described below: .. 
code-block:: c++ - int i = 42; // warns that 'i' is too short + int doubler(int x) // warns that x is too short + { + return 2 * x; + } This check does not have any fix suggestions in the general case since variable names have semantic value. From c108c1e94580d70e2be66172ab4397fcff004376 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 28 May 2024 13:26:36 -0700 Subject: [PATCH 011/230] [WebAssembly] Rename old EH tests to *-legacy (#93585) I think test files for the legacy and the new EH (exnref) are better be separate, and I'd like to use the current test file names for the new EH, rather than keeping the current files and naming the new ones as `-new` or something. --- .../WebAssembly/{cfg-stackify-eh.ll => cfg-stackify-eh-legacy.ll} | 0 .../CodeGen/WebAssembly/{exception.ll => exception-legacy.ll} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename llvm/test/CodeGen/WebAssembly/{cfg-stackify-eh.ll => cfg-stackify-eh-legacy.ll} (100%) rename llvm/test/CodeGen/WebAssembly/{exception.ll => exception-legacy.ll} (100%) diff --git a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.ll similarity index 100% rename from llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll rename to llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.ll diff --git a/llvm/test/CodeGen/WebAssembly/exception.ll b/llvm/test/CodeGen/WebAssembly/exception-legacy.ll similarity index 100% rename from llvm/test/CodeGen/WebAssembly/exception.ll rename to llvm/test/CodeGen/WebAssembly/exception-legacy.ll From 9e89d107a6ec2ade15eddb549fa473cf09bf230e Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 28 May 2024 13:30:00 -0700 Subject: [PATCH 012/230] [memprof] Add MemProf format Version 3 (#93608) This patch adds Version 3 for development purposes. For now, this patch adds V3 as a copy of V2. For the most part, this patch adds "case Version3:" wherever "case Version2:" appears. 
One exception is writeMemProfV3, which is copied from writeMemProfV2 but updated to write out memprof::Version3 to the MemProf header. We'll incrementally modify writeMemProfV3 in subsequent patches. --- llvm/include/llvm/ProfileData/MemProf.h | 4 +- llvm/lib/ProfileData/InstrProfReader.cpp | 4 +- llvm/lib/ProfileData/InstrProfWriter.cpp | 52 +++++++++++++++++++ llvm/lib/ProfileData/MemProf.cpp | 4 ++ .../llvm-profdata/memprof-merge-versions.test | 6 +++ llvm/tools/llvm-profdata/llvm-profdata.cpp | 3 +- 6 files changed, 70 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h index 17cef15344285b..d44a2d1e2fb117 100644 --- a/llvm/include/llvm/ProfileData/MemProf.h +++ b/llvm/include/llvm/ProfileData/MemProf.h @@ -28,10 +28,12 @@ enum IndexedVersion : uint64_t { Version1 = 1, // Version 2: Added a call stack table. Version2 = 2, + // Version 3: Under development. + Version3 = 3, }; constexpr uint64_t MinimumSupportedVersion = Version0; -constexpr uint64_t MaximumSupportedVersion = Version2; +constexpr uint64_t MaximumSupportedVersion = Version3; // Verify that the minimum and maximum satisfy the obvious constraint. static_assert(MinimumSupportedVersion <= MaximumSupportedVersion); diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index 836206a4fd86e2..798236c295194a 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -1212,7 +1212,8 @@ Error IndexedMemProfReader::deserialize(const unsigned char *Start, const uint64_t FirstWord = support::endian::readNext(Ptr); - if (FirstWord == memprof::Version1 || FirstWord == memprof::Version2) { + if (FirstWord == memprof::Version1 || FirstWord == memprof::Version2 || + FirstWord == memprof::Version3) { // Everything is good. We can proceed to deserialize the rest. 
Version = static_cast(FirstWord); } else if (FirstWord >= 24) { @@ -1559,6 +1560,7 @@ IndexedMemProfReader::getMemProfRecord(const uint64_t FuncNameHash) const { "MemProfCallStackTable must not be available"); return getMemProfRecordV0(IndexedRecord, *MemProfFrameTable); case memprof::Version2: + case memprof::Version3: assert(MemProfFrameTable && "MemProfFrameTable must be available"); assert(MemProfCallStackTable && "MemProfCallStackTable must be available"); return getMemProfRecordV2(IndexedRecord, *MemProfFrameTable, diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index b67a9700b680ab..b16714ae8b9a2d 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -617,6 +617,56 @@ static Error writeMemProfV2(ProfOStream &OS, return Error::success(); } +// Write out MemProf Version3 as follows: +// uint64_t Version +// uint64_t RecordTableOffset = RecordTableGenerator.Emit +// uint64_t FramePayloadOffset = Offset for the frame payload +// uint64_t FrameTableOffset = FrameTableGenerator.Emit +// uint64_t CallStackPayloadOffset = Offset for the call stack payload +// uint64_t CallStackTableOffset = CallStackTableGenerator.Emit +// uint64_t Num schema entries +// uint64_t Schema entry 0 +// uint64_t Schema entry 1 +// .... +// uint64_t Schema entry N - 1 +// OnDiskChainedHashTable MemProfRecordData +// OnDiskChainedHashTable MemProfFrameData +// OnDiskChainedHashTable MemProfCallStackData +static Error writeMemProfV3(ProfOStream &OS, + memprof::IndexedMemProfData &MemProfData, + bool MemProfFullSchema) { + OS.write(memprof::Version3); + uint64_t HeaderUpdatePos = OS.tell(); + OS.write(0ULL); // Reserve space for the memprof record table offset. + OS.write(0ULL); // Reserve space for the memprof frame payload offset. + OS.write(0ULL); // Reserve space for the memprof frame table offset. + OS.write(0ULL); // Reserve space for the memprof call stack payload offset. 
+ OS.write(0ULL); // Reserve space for the memprof call stack table offset. + + auto Schema = memprof::getHotColdSchema(); + if (MemProfFullSchema) + Schema = memprof::getFullSchema(); + writeMemProfSchema(OS, Schema); + + uint64_t RecordTableOffset = writeMemProfRecords(OS, MemProfData.RecordData, + &Schema, memprof::Version3); + + uint64_t FramePayloadOffset = OS.tell(); + uint64_t FrameTableOffset = writeMemProfFrames(OS, MemProfData.FrameData); + + uint64_t CallStackPayloadOffset = OS.tell(); + uint64_t CallStackTableOffset = + writeMemProfCallStacks(OS, MemProfData.CallStackData); + + uint64_t Header[] = { + RecordTableOffset, FramePayloadOffset, FrameTableOffset, + CallStackPayloadOffset, CallStackTableOffset, + }; + OS.patch({{HeaderUpdatePos, Header, std::size(Header)}}); + + return Error::success(); +} + // Write out the MemProf data in a requested version. static Error writeMemProf(ProfOStream &OS, memprof::IndexedMemProfData &MemProfData, @@ -629,6 +679,8 @@ static Error writeMemProf(ProfOStream &OS, return writeMemProfV1(OS, MemProfData); case memprof::Version2: return writeMemProfV2(OS, MemProfData, MemProfFullSchema); + case memprof::Version3: + return writeMemProfV3(OS, MemProfData, MemProfFullSchema); } return make_error( diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp index 89afe7c39027c6..2f0e53736c82e5 100644 --- a/llvm/lib/ProfileData/MemProf.cpp +++ b/llvm/lib/ProfileData/MemProf.cpp @@ -52,6 +52,7 @@ size_t IndexedAllocationInfo::serializedSize(const MemProfSchema &Schema, case Version1: return serializedSizeV0(*this, Schema); case Version2: + case Version3: return serializedSizeV2(*this, Schema); } llvm_unreachable("unsupported MemProf version"); @@ -95,6 +96,7 @@ size_t IndexedMemProfRecord::serializedSize(const MemProfSchema &Schema, case Version1: return serializedSizeV0(*this, Schema); case Version2: + case Version3: return serializedSizeV2(*this, Schema); } llvm_unreachable("unsupported MemProf version"); 
@@ -149,6 +151,7 @@ void IndexedMemProfRecord::serialize(const MemProfSchema &Schema, serializeV0(*this, Schema, OS); return; case Version2: + case Version3: serializeV2(*this, Schema, OS); return; } @@ -239,6 +242,7 @@ IndexedMemProfRecord::deserialize(const MemProfSchema &Schema, case Version1: return deserializeV0(Schema, Ptr); case Version2: + case Version3: return deserializeV2(Schema, Ptr); } llvm_unreachable("unsupported MemProf version"); diff --git a/llvm/test/tools/llvm-profdata/memprof-merge-versions.test b/llvm/test/tools/llvm-profdata/memprof-merge-versions.test index 28f65e0781bc63..aa7d0329425dc5 100644 --- a/llvm/test/tools/llvm-profdata/memprof-merge-versions.test +++ b/llvm/test/tools/llvm-profdata/memprof-merge-versions.test @@ -19,6 +19,12 @@ RUN: llvm-profdata show %t.prof.v2 | FileCheck %s RUN: llvm-profdata merge %t.proftext %p/Inputs/basic.memprofraw --memprof-version=2 --memprof-full-schema --profiled-binary %p/Inputs/basic.memprofexe -o %t.prof.v2 RUN: llvm-profdata show %t.prof.v2 | FileCheck %s +RUN: llvm-profdata merge %t.proftext %p/Inputs/basic.memprofraw --memprof-version=3 --profiled-binary %p/Inputs/basic.memprofexe -o %t.prof.v3 +RUN: llvm-profdata show %t.prof.v3 | FileCheck %s + +RUN: llvm-profdata merge %t.proftext %p/Inputs/basic.memprofraw --memprof-version=3 --memprof-full-schema --profiled-binary %p/Inputs/basic.memprofexe -o %t.prof.v3 +RUN: llvm-profdata show %t.prof.v3 | FileCheck %s + For now we only check the validity of the instrumented profile since we don't have a way to display the contents of the memprof indexed format yet. 
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp index 28c3afa1016473..fae6d1e989ab5a 100644 --- a/llvm/tools/llvm-profdata/llvm-profdata.cpp +++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp @@ -306,7 +306,8 @@ cl::opt MemProfVersionRequested( cl::init(memprof::Version0), cl::values(clEnumValN(memprof::Version0, "0", "version 0"), clEnumValN(memprof::Version1, "1", "version 1"), - clEnumValN(memprof::Version2, "2", "version 2"))); + clEnumValN(memprof::Version2, "2", "version 2"), + clEnumValN(memprof::Version3, "3", "version 3"))); cl::opt MemProfFullSchema( "memprof-full-schema", cl::Hidden, cl::sub(MergeSubcommand), From 193e9007ef0bef6c881ab26746221f22ec674447 Mon Sep 17 00:00:00 2001 From: erichkeane Date: Tue, 28 May 2024 13:18:46 -0700 Subject: [PATCH 013/230] [OpenACC][NFC] Fix begin loc and split it from the directive location I discovered while working on something else that we were using the location of the directive name as the 'beginloc' which caused some problems in a few places. This patch makes it so our beginloc is the '#' as we originally designed, and then adds a DirectiveLoc concept to a construct for use diagnosing the name. 
--- clang/include/clang/AST/StmtOpenACC.h | 32 ++++++++++++++--------- clang/include/clang/Parse/Parser.h | 1 + clang/include/clang/Sema/SemaOpenACC.h | 3 ++- clang/lib/AST/StmtOpenACC.cpp | 13 +++++---- clang/lib/Parse/ParseOpenACC.cpp | 19 +++++++------- clang/lib/Sema/SemaOpenACC.cpp | 7 ++--- clang/lib/Sema/TreeTransform.h | 9 ++++--- clang/lib/Serialization/ASTReaderStmt.cpp | 1 + clang/lib/Serialization/ASTWriterStmt.cpp | 1 + 9 files changed, 49 insertions(+), 37 deletions(-) diff --git a/clang/include/clang/AST/StmtOpenACC.h b/clang/include/clang/AST/StmtOpenACC.h index b706864798baaf..04daf511f58713 100644 --- a/clang/include/clang/AST/StmtOpenACC.h +++ b/clang/include/clang/AST/StmtOpenACC.h @@ -31,6 +31,8 @@ class OpenACCConstructStmt : public Stmt { /// The location of the directive statement, from the '#' to the last token of /// the directive. SourceRange Range; + /// The location of the directive name. + SourceLocation DirectiveLoc; /// The list of clauses. This is stored here as an ArrayRef, as this is the /// most convienient place to access the list, however the list itself should @@ -39,8 +41,9 @@ class OpenACCConstructStmt : public Stmt { protected: OpenACCConstructStmt(StmtClass SC, OpenACCDirectiveKind K, - SourceLocation Start, SourceLocation End) - : Stmt(SC), Kind(K), Range(Start, End) {} + SourceLocation Start, SourceLocation DirectiveLoc, + SourceLocation End) + : Stmt(SC), Kind(K), Range(Start, End), DirectiveLoc(DirectiveLoc) {} // Used only for initialization, the leaf class can initialize this to // trailing storage. 
@@ -59,6 +62,7 @@ class OpenACCConstructStmt : public Stmt { SourceLocation getBeginLoc() const { return Range.getBegin(); } SourceLocation getEndLoc() const { return Range.getEnd(); } + SourceLocation getDirectiveLoc() const { return DirectiveLoc; } ArrayRef clauses() const { return Clauses; } child_range children() { @@ -81,9 +85,11 @@ class OpenACCAssociatedStmtConstruct : public OpenACCConstructStmt { protected: OpenACCAssociatedStmtConstruct(StmtClass SC, OpenACCDirectiveKind K, - SourceLocation Start, SourceLocation End, - Stmt *AssocStmt) - : OpenACCConstructStmt(SC, K, Start, End), AssociatedStmt(AssocStmt) {} + SourceLocation Start, + SourceLocation DirectiveLoc, + SourceLocation End, Stmt *AssocStmt) + : OpenACCConstructStmt(SC, K, Start, DirectiveLoc, End), + AssociatedStmt(AssocStmt) {} void setAssociatedStmt(Stmt *S) { AssociatedStmt = S; } Stmt *getAssociatedStmt() { return AssociatedStmt; } @@ -126,10 +132,10 @@ class OpenACCComputeConstruct final friend class ASTStmtReader; friend class ASTContext; OpenACCComputeConstruct(unsigned NumClauses) - : OpenACCAssociatedStmtConstruct(OpenACCComputeConstructClass, - OpenACCDirectiveKind::Invalid, - SourceLocation{}, SourceLocation{}, - /*AssociatedStmt=*/nullptr) { + : OpenACCAssociatedStmtConstruct( + OpenACCComputeConstructClass, OpenACCDirectiveKind::Invalid, + SourceLocation{}, SourceLocation{}, SourceLocation{}, + /*AssociatedStmt=*/nullptr) { // We cannot send the TrailingObjects storage to the base class (which holds // a reference to the data) until it is constructed, so we have to set it // separately here. 
@@ -141,11 +147,11 @@ class OpenACCComputeConstruct final } OpenACCComputeConstruct(OpenACCDirectiveKind K, SourceLocation Start, - SourceLocation End, + SourceLocation DirectiveLoc, SourceLocation End, ArrayRef Clauses, Stmt *StructuredBlock) : OpenACCAssociatedStmtConstruct(OpenACCComputeConstructClass, K, Start, - End, StructuredBlock) { + DirectiveLoc, End, StructuredBlock) { assert(isOpenACCComputeDirectiveKind(K) && "Only parallel, serial, and kernels constructs should be " "represented by this type"); @@ -169,8 +175,8 @@ class OpenACCComputeConstruct final unsigned NumClauses); static OpenACCComputeConstruct * Create(const ASTContext &C, OpenACCDirectiveKind K, SourceLocation BeginLoc, - SourceLocation EndLoc, ArrayRef Clauses, - Stmt *StructuredBlock); + SourceLocation DirectiveLoc, SourceLocation EndLoc, + ArrayRef Clauses, Stmt *StructuredBlock); Stmt *getStructuredBlock() { return getAssociatedStmt(); } const Stmt *getStructuredBlock() const { diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 00b475e5b42824..d054b8cf0d2405 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -3659,6 +3659,7 @@ class Parser : public CodeCompletionHandler { struct OpenACCDirectiveParseInfo { OpenACCDirectiveKind DirKind; SourceLocation StartLoc; + SourceLocation DirLoc; SourceLocation EndLoc; SmallVector Clauses; // TODO OpenACC: As we implement support for the Atomic, Routine, Cache, and diff --git a/clang/include/clang/Sema/SemaOpenACC.h b/clang/include/clang/Sema/SemaOpenACC.h index 6f69fa08939b82..66144de4340a8a 100644 --- a/clang/include/clang/Sema/SemaOpenACC.h +++ b/clang/include/clang/Sema/SemaOpenACC.h @@ -379,7 +379,7 @@ class SemaOpenACC : public SemaBase { /// Called after the construct has been parsed, but clauses haven't been /// parsed. This allows us to diagnose not-implemented, as well as set up any /// state required for parsing the clauses. 
- void ActOnConstruct(OpenACCDirectiveKind K, SourceLocation StartLoc); + void ActOnConstruct(OpenACCDirectiveKind K, SourceLocation DirLoc); /// Called after the directive, including its clauses, have been parsed and /// parsing has consumed the 'annot_pragma_openacc_end' token. This DOES @@ -400,6 +400,7 @@ class SemaOpenACC : public SemaBase { /// declaration group or associated statement. StmtResult ActOnEndStmtDirective(OpenACCDirectiveKind K, SourceLocation StartLoc, + SourceLocation DirLoc, SourceLocation EndLoc, ArrayRef Clauses, StmtResult AssocStmt); diff --git a/clang/lib/AST/StmtOpenACC.cpp b/clang/lib/AST/StmtOpenACC.cpp index a381a8dd7b62c3..47899b344c97ab 100644 --- a/clang/lib/AST/StmtOpenACC.cpp +++ b/clang/lib/AST/StmtOpenACC.cpp @@ -23,15 +23,14 @@ OpenACCComputeConstruct::CreateEmpty(const ASTContext &C, unsigned NumClauses) { return Inst; } -OpenACCComputeConstruct * -OpenACCComputeConstruct::Create(const ASTContext &C, OpenACCDirectiveKind K, - SourceLocation BeginLoc, SourceLocation EndLoc, - ArrayRef Clauses, - Stmt *StructuredBlock) { +OpenACCComputeConstruct *OpenACCComputeConstruct::Create( + const ASTContext &C, OpenACCDirectiveKind K, SourceLocation BeginLoc, + SourceLocation DirLoc, SourceLocation EndLoc, + ArrayRef Clauses, Stmt *StructuredBlock) { void *Mem = C.Allocate( OpenACCComputeConstruct::totalSizeToAlloc( Clauses.size())); - auto *Inst = new (Mem) - OpenACCComputeConstruct(K, BeginLoc, EndLoc, Clauses, StructuredBlock); + auto *Inst = new (Mem) OpenACCComputeConstruct(K, BeginLoc, DirLoc, EndLoc, + Clauses, StructuredBlock); return Inst; } diff --git a/clang/lib/Parse/ParseOpenACC.cpp b/clang/lib/Parse/ParseOpenACC.cpp index e9c60f76165b68..63afc18783a1f7 100644 --- a/clang/lib/Parse/ParseOpenACC.cpp +++ b/clang/lib/Parse/ParseOpenACC.cpp @@ -1347,11 +1347,13 @@ void Parser::ParseOpenACCCacheVarList() { ParseOpenACCVarList(OpenACCClauseKind::Invalid); } -Parser::OpenACCDirectiveParseInfo Parser::ParseOpenACCDirective() { - 
SourceLocation StartLoc = getCurToken().getLocation(); +Parser::OpenACCDirectiveParseInfo +Parser::ParseOpenACCDirective() { + SourceLocation StartLoc = ConsumeAnnotationToken(); + SourceLocation DirLoc = getCurToken().getLocation(); OpenACCDirectiveKind DirKind = ParseOpenACCDirectiveKind(*this); - getActions().OpenACC().ActOnConstruct(DirKind, StartLoc); + getActions().OpenACC().ActOnConstruct(DirKind, DirLoc); // Once we've parsed the construct/directive name, some have additional // specifiers that need to be taken care of. Atomic has an 'atomic-clause' @@ -1390,7 +1392,7 @@ Parser::OpenACCDirectiveParseInfo Parser::ParseOpenACCDirective() { break; case OpenACCDirectiveKind::Wait: // OpenACC has an optional paren-wrapped 'wait-argument'. - if (ParseOpenACCWaitArgument(StartLoc, /*IsDirective=*/true).Failed) + if (ParseOpenACCWaitArgument(DirLoc, /*IsDirective=*/true).Failed) T.skipToEnd(); else T.consumeClose(); @@ -1404,7 +1406,8 @@ Parser::OpenACCDirectiveParseInfo Parser::ParseOpenACCDirective() { } // Parses the list of clauses, if present, plus set up return value. 
- OpenACCDirectiveParseInfo ParseInfo{DirKind, StartLoc, SourceLocation{}, + OpenACCDirectiveParseInfo ParseInfo{DirKind, StartLoc, DirLoc, + SourceLocation{}, ParseOpenACCClauseList(DirKind)}; assert(Tok.is(tok::annot_pragma_openacc_end) && @@ -1421,7 +1424,6 @@ Parser::DeclGroupPtrTy Parser::ParseOpenACCDirectiveDecl() { assert(Tok.is(tok::annot_pragma_openacc) && "expected OpenACC Start Token"); ParsingOpenACCDirectiveRAII DirScope(*this); - ConsumeAnnotationToken(); OpenACCDirectiveParseInfo DirInfo = ParseOpenACCDirective(); @@ -1438,7 +1440,6 @@ StmtResult Parser::ParseOpenACCDirectiveStmt() { assert(Tok.is(tok::annot_pragma_openacc) && "expected OpenACC Start Token"); ParsingOpenACCDirectiveRAII DirScope(*this); - ConsumeAnnotationToken(); OpenACCDirectiveParseInfo DirInfo = ParseOpenACCDirective(); if (getActions().OpenACC().ActOnStartStmtDirective(DirInfo.DirKind, @@ -1456,6 +1457,6 @@ StmtResult Parser::ParseOpenACCDirectiveStmt() { } return getActions().OpenACC().ActOnEndStmtDirective( - DirInfo.DirKind, DirInfo.StartLoc, DirInfo.EndLoc, DirInfo.Clauses, - AssocStmt); + DirInfo.DirKind, DirInfo.StartLoc, DirInfo.DirLoc, DirInfo.EndLoc, + DirInfo.Clauses, AssocStmt); } diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp index 09d91b31cfe5f9..15239f4f35c39f 100644 --- a/clang/lib/Sema/SemaOpenACC.cpp +++ b/clang/lib/Sema/SemaOpenACC.cpp @@ -844,7 +844,7 @@ ExprResult SemaOpenACC::CheckReductionVar(Expr *VarExpr) { } void SemaOpenACC::ActOnConstruct(OpenACCDirectiveKind K, - SourceLocation StartLoc) { + SourceLocation DirLoc) { switch (K) { case OpenACCDirectiveKind::Invalid: // Nothing to do here, an invalid kind has nothing we can check here. We @@ -859,7 +859,7 @@ void SemaOpenACC::ActOnConstruct(OpenACCDirectiveKind K, // here as these constructs do not take any arguments. 
break; default: - Diag(StartLoc, diag::warn_acc_construct_unimplemented) << K; + Diag(DirLoc, diag::warn_acc_construct_unimplemented) << K; break; } } @@ -1265,6 +1265,7 @@ bool SemaOpenACC::ActOnStartStmtDirective(OpenACCDirectiveKind K, StmtResult SemaOpenACC::ActOnEndStmtDirective(OpenACCDirectiveKind K, SourceLocation StartLoc, + SourceLocation DirLoc, SourceLocation EndLoc, ArrayRef Clauses, StmtResult AssocStmt) { @@ -1278,7 +1279,7 @@ StmtResult SemaOpenACC::ActOnEndStmtDirective(OpenACCDirectiveKind K, case OpenACCDirectiveKind::Kernels: // TODO OpenACC: Add clauses to the construct here. return OpenACCComputeConstruct::Create( - getASTContext(), K, StartLoc, EndLoc, Clauses, + getASTContext(), K, StartLoc, DirLoc, EndLoc, Clauses, AssocStmt.isUsable() ? AssocStmt.get() : nullptr); } llvm_unreachable("Unhandled case in directive handling?"); diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index dee335b526991b..765e6177d202d1 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -4033,11 +4033,12 @@ class TreeTransform { StmtResult RebuildOpenACCComputeConstruct(OpenACCDirectiveKind K, SourceLocation BeginLoc, + SourceLocation DirLoc, SourceLocation EndLoc, ArrayRef Clauses, StmtResult StrBlock) { - return getSema().OpenACC().ActOnEndStmtDirective(K, BeginLoc, EndLoc, - Clauses, StrBlock); + return getSema().OpenACC().ActOnEndStmtDirective(K, BeginLoc, DirLoc, + EndLoc, Clauses, StrBlock); } private: @@ -11559,8 +11560,8 @@ StmtResult TreeTransform::TransformOpenACCComputeConstruct( getSema().OpenACC().ActOnAssociatedStmt(C->getDirectiveKind(), StrBlock); return getDerived().RebuildOpenACCComputeConstruct( - C->getDirectiveKind(), C->getBeginLoc(), C->getEndLoc(), - TransformedClauses, StrBlock); + C->getDirectiveKind(), C->getBeginLoc(), C->getDirectiveLoc(), + C->getEndLoc(), TransformedClauses, StrBlock); } //===----------------------------------------------------------------------===// diff --git 
a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index eac4faff285490..bea2b949891070 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -2797,6 +2797,7 @@ void ASTStmtReader::VisitOpenACCConstructStmt(OpenACCConstructStmt *S) { (void)Record.readInt(); S->Kind = Record.readEnum(); S->Range = Record.readSourceRange(); + S->DirectiveLoc = Record.readSourceLocation(); Record.readOpenACCClauseList(S->Clauses); } diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index a44852af97bea3..3c586b270fbf4f 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -2847,6 +2847,7 @@ void ASTStmtWriter::VisitOpenACCConstructStmt(OpenACCConstructStmt *S) { Record.push_back(S->clauses().size()); Record.writeEnum(S->Kind); Record.AddSourceRange(S->Range); + Record.AddSourceLocation(S->DirectiveLoc); Record.writeOpenACCClauseList(S->clauses()); } From 5a23d31c5033dcb41d374692ed26d87ed8e2665a Mon Sep 17 00:00:00 2001 From: William Junda Huang Date: Tue, 28 May 2024 16:41:53 -0400 Subject: [PATCH 014/230] [Sample Profile] Check hot callsite threshold when inlining a function with a sample profile (#93286) Currently if a callsite is hot as determined by the sample profile, it is unconditionally inlined barring invalid cases (such as recursion). Inline cost check should still apply because a function's hotness and its inline cost are two different things. For example if a function is calling another very large function multiple times (at different code paths), the large function should not be inlined even if it's hot.
--- llvm/lib/Transforms/IPO/SampleProfile.cpp | 7 ++- .../Inputs/inline-hot-callsite-threshold.prof | 3 + .../inline-hot-callsite-threshold.ll | 61 +++++++++++++++++++ .../SampleProfile/pseudo-probe-inline.ll | 2 +- llvm/test/Transforms/SampleProfile/remarks.ll | 4 +- 5 files changed, 71 insertions(+), 6 deletions(-) create mode 100644 llvm/test/Transforms/SampleProfile/Inputs/inline-hot-callsite-threshold.prof create mode 100644 llvm/test/Transforms/SampleProfile/inline-hot-callsite-threshold.ll diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index 0920179fb76b73..92ad4c34da6e7e 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -1391,10 +1391,11 @@ SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) { return InlineCost::getAlways("preinliner"); } - // For old FDO inliner, we inline the call site as long as cost is not - // "Never". The cost-benefit check is done earlier. + // For old FDO inliner, we inline the call site if it is below hot threshold, + // even if the function is hot based on sample profile data. This is to + // prevent huge functions from being inlined. 
if (!CallsitePrioritizedInline) { - return InlineCost::get(Cost.getCost(), INT_MAX); + return InlineCost::get(Cost.getCost(), SampleHotCallSiteThreshold); } // Otherwise only use the cost from call analyzer, but overwite threshold with diff --git a/llvm/test/Transforms/SampleProfile/Inputs/inline-hot-callsite-threshold.prof b/llvm/test/Transforms/SampleProfile/Inputs/inline-hot-callsite-threshold.prof new file mode 100644 index 00000000000000..d1c0408210f498 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/Inputs/inline-hot-callsite-threshold.prof @@ -0,0 +1,3 @@ +foo:100:100 + 1: bar:100 + 1:100 diff --git a/llvm/test/Transforms/SampleProfile/inline-hot-callsite-threshold.ll b/llvm/test/Transforms/SampleProfile/inline-hot-callsite-threshold.ll new file mode 100644 index 00000000000000..914ab4f1e3da58 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/inline-hot-callsite-threshold.ll @@ -0,0 +1,61 @@ +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-hot-callsite-threshold.prof -S -pass-remarks=sample-profile -sample-profile-hot-inline-threshold=100 2>&1 | FileCheck %s + +; CHECK: remark: a.cc:6:12: 'bar' inlined into 'foo' to match profiling context with (cost={{.*}}, threshold=100) +; CHECK: define dso_local noundef i32 @foo(i32 noundef %0) +; CHECK-NOT: %2 = tail call noundef i32 @bar(i32 noundef %0) +; CHECK-NEXT: %2 = icmp sgt i32 %0, 1 +; CHECK-NEXT: br i1 %2, label %3, label %bar.exit + +; Manually lower cost threshold for hot function inlining, so that the function +; is not inlined even profile indicates it as hot. 
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-hot-callsite-threshold.prof -S -pass-remarks=sample-profile -sample-profile-hot-inline-threshold=1 2>&1 | FileCheck %s --check-prefix=COST + +; COST-NOT: remark +; COST: define dso_local noundef i32 @foo(i32 noundef %0) +; COST-NEXT: %2 = tail call noundef i32 @bar(i32 noundef %0) + +define dso_local noundef i32 @bar(i32 noundef %0) #0 !dbg !10 { + %2 = icmp sgt i32 %0, 1 + br i1 %2, label %3, label %15 +3: ; preds = %1 + %4 = add nsw i32 %0, -2 + %5 = mul i32 %4, %4 + %6 = add i32 %5, %0 + %7 = zext nneg i32 %4 to i33 + %8 = add nsw i32 %0, -3 + %9 = zext i32 %8 to i33 + %10 = mul i33 %7, %9 + %11 = lshr i33 %10, 1 + %12 = trunc nuw i33 %11 to i32 + %13 = xor i32 %12, -1 + %14 = add i32 %6, %13 + br label %15 +15: ; preds = %3, %1 + %16 = phi i32 [ 0, %1 ], [ %14, %3 ] + ret i32 %16 +} + +define dso_local noundef i32 @foo(i32 noundef %0) #1 !dbg !20 { + %2 = tail call noundef i32 @bar(i32 noundef %0), !dbg !24 + ret i32 %2 +} + +attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "use-sample-profile" } +attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "use-sample-profile" } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug) +!1 = !DIFile(filename: "a.cc", directory: ".") +!2 = !{i32 2, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!10 = distinct !DISubprogram(name: "bar", linkageName: "bar", scope: !1, file: !1, line: 1, type: !12, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0) +!11 = !DIFile(filename: "a.cc", directory: ".") +!12 = !DISubroutineType(types: !13) +!13 = !{!14, !14} +!14 = 
!DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!20 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: !11, file: !11, line: 5, type: !12, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0) +!23 = !DILocation(line: 0, scope: !20) +!24 = !DILocation(line: 6, column: 12, scope: !20) diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll index 18cbd857d97bb2..2cd9abf0e11e94 100644 --- a/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll +++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll @@ -98,7 +98,7 @@ if.end: ;YAML-NEXT: - String: '(cost=' ;YAML-NEXT: - Cost: '15' ;YAML-NEXT: - String: ', threshold=' -;YAML-NEXT: - Threshold: '2147483647' +;YAML-NEXT: - Threshold: '3000' ;YAML-NEXT: - String: ')' ;YAML-NEXT: - String: ' at callsite ' ;YAML-NEXT: - String: foo diff --git a/llvm/test/Transforms/SampleProfile/remarks.ll b/llvm/test/Transforms/SampleProfile/remarks.ll index 997e02bb5b5444..9c0143ae65ca77 100644 --- a/llvm/test/Transforms/SampleProfile/remarks.ll +++ b/llvm/test/Transforms/SampleProfile/remarks.ll @@ -22,7 +22,7 @@ ; We are expecting foo() to be inlined in main() (almost all the cycles are ; spent inside foo). -; CHECK: remark: remarks.cc:13:21: '_Z3foov' inlined into 'main' to match profiling context with (cost=130, threshold=2147483647) at callsite main:0:21; +; CHECK: remark: remarks.cc:13:21: '_Z3foov' inlined into 'main' to match profiling context with (cost=130, threshold=3000) at callsite main:0:21; ; CHECK: remark: remarks.cc:9:19: 'rand' inlined into 'main' to match profiling context with (cost=always): always inline attribute at callsite _Z3foov:6:19 @ main:0:21; ; The back edge for the loop is the hottest edge in the loop subgraph. 
@@ -51,7 +51,7 @@ ;YAML-NEXT: - String: '(cost=' ;YAML-NEXT: - Cost: '130' ;YAML-NEXT: - String: ', threshold=' -;YAML-NEXT: - Threshold: '2147483647' +;YAML-NEXT: - Threshold: '3000' ;YAML-NEXT: - String: ')' ;YAML-NEXT: - String: ' at callsite ' ;YAML-NEXT: - String: main From 6a47315a3cb2c6d381809f0ba5c89bd8dcdbcaa0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= Date: Tue, 28 May 2024 22:45:32 +0200 Subject: [PATCH 015/230] [clang-repl] Even more tests create the Interpreter and must check host JIT support (#84758) --- .../Interpreter/CodeCompletionTest.cpp | 85 +++++++++++++++++++ .../Interpreter/IncrementalProcessingTest.cpp | 3 + 2 files changed, 88 insertions(+) diff --git a/clang/unittests/Interpreter/CodeCompletionTest.cpp b/clang/unittests/Interpreter/CodeCompletionTest.cpp index 873fbda32f0579..72c02c683fafd4 100644 --- a/clang/unittests/Interpreter/CodeCompletionTest.cpp +++ b/clang/unittests/Interpreter/CodeCompletionTest.cpp @@ -4,6 +4,7 @@ #include "clang/Lex/Preprocessor.h" #include "clang/Sema/CodeCompleteConsumer.h" #include "clang/Sema/Sema.h" +#include "llvm/ExecutionEngine/Orc/LLJIT.h" #include "llvm/LineEditor/LineEditor.h" #include "llvm/Support/Error.h" #include "llvm/Support/raw_ostream.h" @@ -11,6 +12,10 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" +#if defined(_AIX) || defined(__MVS__) +#define CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +#endif + using namespace clang; namespace { auto CB = clang::IncrementalCompilerBuilder(); @@ -50,7 +55,21 @@ static std::vector runComp(clang::Interpreter &MainInterp, return Comps; } +static bool HostSupportsJit() { + auto J = llvm::orc::LLJITBuilder().create(); + if (J) + return true; + LLVMConsumeError(llvm::wrap(J.takeError())); + return false; +} + +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_Sanity) { +#else TEST(CodeCompletionTest, Sanity) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = 
createInterpreter(); cantFail(Interp->Parse("int foo = 12;")); auto Err = llvm::Error::success(); @@ -61,7 +80,13 @@ TEST(CodeCompletionTest, Sanity) { EXPECT_EQ((bool)Err, false); } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_SanityNoneValid) { +#else TEST(CodeCompletionTest, SanityNoneValid) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); cantFail(Interp->Parse("int foo = 12;")); auto Err = llvm::Error::success(); @@ -70,7 +95,13 @@ TEST(CodeCompletionTest, SanityNoneValid) { EXPECT_EQ((bool)Err, false); } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_TwoDecls) { +#else TEST(CodeCompletionTest, TwoDecls) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); cantFail(Interp->Parse("int application = 12;")); cantFail(Interp->Parse("int apple = 12;")); @@ -80,14 +111,26 @@ TEST(CodeCompletionTest, TwoDecls) { EXPECT_EQ((bool)Err, false); } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_CompFunDeclsNoError) { +#else TEST(CodeCompletionTest, CompFunDeclsNoError) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); auto Err = llvm::Error::success(); auto comps = runComp(*Interp, "void app(", Err); EXPECT_EQ((bool)Err, false); } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_TypedDirected) { +#else TEST(CodeCompletionTest, TypedDirected) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); cantFail(Interp->Parse("int application = 12;")); cantFail(Interp->Parse("char apple = '2';")); @@ -119,7 +162,13 @@ TEST(CodeCompletionTest, TypedDirected) { } } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_SanityClasses) { +#else TEST(CodeCompletionTest, SanityClasses) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto 
Interp = createInterpreter(); cantFail(Interp->Parse("struct Apple{};")); cantFail(Interp->Parse("void takeApple(Apple &a1){}")); @@ -142,7 +191,13 @@ TEST(CodeCompletionTest, SanityClasses) { } } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_SubClassing) { +#else TEST(CodeCompletionTest, SubClassing) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); cantFail(Interp->Parse("struct Fruit {};")); cantFail(Interp->Parse("struct Apple : Fruit{};")); @@ -157,7 +212,13 @@ TEST(CodeCompletionTest, SubClassing) { EXPECT_EQ((bool)Err, false); } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_MultipleArguments) { +#else TEST(CodeCompletionTest, MultipleArguments) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); cantFail(Interp->Parse("int foo = 42;")); cantFail(Interp->Parse("char fowl = 'A';")); @@ -169,7 +230,13 @@ TEST(CodeCompletionTest, MultipleArguments) { EXPECT_EQ((bool)Err, false); } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_Methods) { +#else TEST(CodeCompletionTest, Methods) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); cantFail(Interp->Parse( "struct Foo{int add(int a){return 42;} int par(int b){return 42;}};")); @@ -183,7 +250,13 @@ TEST(CodeCompletionTest, Methods) { EXPECT_EQ((bool)Err, false); } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_MethodsInvocations) { +#else TEST(CodeCompletionTest, MethodsInvocations) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); cantFail(Interp->Parse( "struct Foo{int add(int a){return 42;} int par(int b){return 42;}};")); @@ -197,7 +270,13 @@ TEST(CodeCompletionTest, MethodsInvocations) { EXPECT_EQ((bool)Err, false); } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, 
DISABLED_NestedInvocations) { +#else TEST(CodeCompletionTest, NestedInvocations) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); cantFail(Interp->Parse( "struct Foo{int add(int a){return 42;} int par(int b){return 42;}};")); @@ -212,7 +291,13 @@ TEST(CodeCompletionTest, NestedInvocations) { EXPECT_EQ((bool)Err, false); } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_TemplateFunctions) { +#else TEST(CodeCompletionTest, TemplateFunctions) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); cantFail( Interp->Parse("template T id(T a) { return a;} ")); diff --git a/clang/unittests/Interpreter/IncrementalProcessingTest.cpp b/clang/unittests/Interpreter/IncrementalProcessingTest.cpp index f3b091b0c0e6cb..9a99ff6262fa3c 100644 --- a/clang/unittests/Interpreter/IncrementalProcessingTest.cpp +++ b/clang/unittests/Interpreter/IncrementalProcessingTest.cpp @@ -61,6 +61,9 @@ TEST(IncrementalProcessing, DISABLED_EmitCXXGlobalInitFunc) { #else TEST(IncrementalProcessing, EmitCXXGlobalInitFunc) { #endif + if (!HostSupportsJit()) + GTEST_SKIP(); + std::vector ClangArgv = {"-Xclang", "-emit-llvm-only"}; auto CB = clang::IncrementalCompilerBuilder(); CB.SetCompilerArgs(ClangArgv); From 98fa0f6981f33b7d8f5aa38babc1e71bc0209de8 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 28 May 2024 20:40:58 +0200 Subject: [PATCH 016/230] DAG: Handle vector splitting for fminnum_ieee/fmaxnum_ieee Avoids regression in future commit which starts producing illegal instances. 
--- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 14e8708fd3f38f..361416edb554ca 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1174,8 +1174,12 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::FADD: case ISD::VP_FADD: case ISD::FSUB: case ISD::VP_FSUB: case ISD::FMUL: case ISD::VP_FMUL: - case ISD::FMINNUM: case ISD::VP_FMINNUM: - case ISD::FMAXNUM: case ISD::VP_FMAXNUM: + case ISD::FMINNUM: + case ISD::FMINNUM_IEEE: + case ISD::VP_FMINNUM: + case ISD::FMAXNUM: + case ISD::FMAXNUM_IEEE: + case ISD::VP_FMAXNUM: case ISD::FMINIMUM: case ISD::VP_FMINIMUM: case ISD::FMAXIMUM: From bbca20f0b1ab7c6ea36a84e88a6abb07f94ca80b Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Tue, 28 May 2024 23:04:12 +0200 Subject: [PATCH 017/230] [Clang][NFC] remove CHAR_PUNCT duplication introduced by #93216 (#93605) --- clang/include/clang/Basic/CharInfo.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/CharInfo.h b/clang/include/clang/Basic/CharInfo.h index 4d90528f7992e3..d71857e8e5dcc3 100644 --- a/clang/include/clang/Basic/CharInfo.h +++ b/clang/include/clang/Basic/CharInfo.h @@ -151,8 +151,7 @@ LLVM_READONLY inline bool isHexDigit(unsigned char c) { /// Note that '_' is both a punctuation character and an identifier character! 
LLVM_READONLY inline bool isPunctuation(unsigned char c) { using namespace charinfo; - return (InfoTable[c] & - (CHAR_UNDER | CHAR_PERIOD | CHAR_PUNCT | CHAR_PUNCT)) != 0; + return (InfoTable[c] & (CHAR_UNDER | CHAR_PERIOD | CHAR_PUNCT)) != 0; } /// Return true if this character is an ASCII printable character; that is, a From df542e1ed82bd4e5a9e345d3a3ae63a76893a0cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= Date: Tue, 28 May 2024 23:18:45 +0200 Subject: [PATCH 018/230] Fix build: [clang-repl] Even more tests create the Interpreter and must check host JIT support (#84758) fea7399e97b73a3209fcbe3338d412069769a637 had removed the unused function that was still there when I tested. --- clang/unittests/Interpreter/IncrementalProcessingTest.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/clang/unittests/Interpreter/IncrementalProcessingTest.cpp b/clang/unittests/Interpreter/IncrementalProcessingTest.cpp index 9a99ff6262fa3c..732753f11306e6 100644 --- a/clang/unittests/Interpreter/IncrementalProcessingTest.cpp +++ b/clang/unittests/Interpreter/IncrementalProcessingTest.cpp @@ -56,6 +56,14 @@ const Function *getGlobalInit(llvm::Module *M) { return nullptr; } +static bool HostSupportsJit() { + auto J = llvm::orc::LLJITBuilder().create(); + if (J) + return true; + LLVMConsumeError(llvm::wrap(J.takeError())); + return false; +} + #ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT TEST(IncrementalProcessing, DISABLED_EmitCXXGlobalInitFunc) { #else From ed4227aad37f2c4adf307b63050fb9aee52b07f8 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 28 May 2024 14:37:15 -0700 Subject: [PATCH 019/230] [SCEV] Add tests for symbolic max BTC requiring predicates. Add extra tests for https://github.com/llvm/llvm-project/pull/93498. 
--- ...cated-symbolic-max-backedge-taken-count.ll | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll diff --git a/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll b/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll new file mode 100644 index 00000000000000..d40416359b65c6 --- /dev/null +++ b/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll @@ -0,0 +1,77 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes='print' -scalar-evolution-classify-expressions=0 -disable-output %s 2>&1 | FileCheck %s + +; %i and %i + 1 can overflow. +define void @test1(i64 %x, ptr %a, ptr %b) { +; CHECK-LABEL: 'test1' +; CHECK-NEXT: Determining loop execution counts for: @test1 +; CHECK-NEXT: Loop %header: Unpredictable backedge-taken count. +; CHECK-NEXT: exit count for header: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: exit count for latch: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: Loop %header: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %header: Unpredictable symbolic max backedge-taken count. 
+; CHECK-NEXT: symbolic max exit count for header: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: symbolic max exit count for latch: ***COULDNOTCOMPUTE*** +; +entry: + br label %header + +header: + %conv11 = phi i64 [ 0, %entry ], [ %conv, %latch ] + %i.010 = phi i32 [ 0, %entry ], [ %add, %latch ] + %add = add i32 %i.010, 1 + %idxprom = zext i32 %add to i64 + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %idxprom + %ld = load i32, ptr %arrayidx, align 4 + %uncountable.c = icmp eq i32 %ld, 10 + br i1 %uncountable.c, label %exit, label %latch + +latch: + %add2 = add nsw i32 %ld, 1 + %arrayidx4 = getelementptr inbounds i32, ptr %b, i64 %conv11 + store i32 %add2, ptr %arrayidx4, align 4 + %conv = zext i32 %add to i64 + %cmp = icmp ult i64 %conv, %x + br i1 %cmp, label %header, label %exit + +exit: + ret void +} + +; %i can overflow. +; +; We need to check that i doesn't wrap, but we don't need a run-time alias +; check. We also need an extra no-wrap check to get the backedge taken count. +define void @test2(i64 %x, ptr %a) { +; CHECK-LABEL: 'test2' +; CHECK-NEXT: Determining loop execution counts for: @test2 +; CHECK-NEXT: Loop %header: Unpredictable backedge-taken count. +; CHECK-NEXT: exit count for header: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: exit count for latch: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: Loop %header: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %header: Unpredictable symbolic max backedge-taken count. 
+; CHECK-NEXT: symbolic max exit count for header: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: symbolic max exit count for latch: ***COULDNOTCOMPUTE*** +; +entry: + br label %header + +header: + %conv11 = phi i64 [ 0, %entry ], [ %conv, %latch ] + %i.010 = phi i32 [ 0, %entry ], [ %inc, %latch ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %conv11 + %ld = load i32, ptr %arrayidx, align 4 + %uncountable.c = icmp eq i32 %ld, 10 + br i1 %uncountable.c, label %exit, label %latch + +latch: + %add = add nsw i32 %ld, 1 + store i32 %add, ptr %arrayidx, align 4 + %inc = add i32 %i.010, 1 + %conv = zext i32 %inc to i64 + %cmp = icmp ult i64 %conv, %x + br i1 %cmp, label %header, label %exit + +exit: + ret void +} From e3f74d4589e29279e9f543b58577a2ece102dc6f Mon Sep 17 00:00:00 2001 From: erichkeane Date: Tue, 28 May 2024 14:25:13 -0700 Subject: [PATCH 020/230] [OpenACC] Correct serialization of certain clause sub-expressions For some reason I was using writeStmtRef when I meant writeStmt, so this corrects that. 
--- clang/lib/Serialization/ASTWriter.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index dd548fabfd9551..e830c4026ea78f 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -7835,7 +7835,7 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) { case OpenACCClauseKind::If: { const auto *IC = cast(C); writeSourceLocation(IC->getLParenLoc()); - writeStmtRef(IC->getConditionExpr()); + AddStmt(const_cast(IC->getConditionExpr())); return; } case OpenACCClauseKind::Self: { @@ -7843,7 +7843,7 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) { writeSourceLocation(SC->getLParenLoc()); writeBool(SC->hasConditionExpr()); if (SC->hasConditionExpr()) - writeStmtRef(SC->getConditionExpr()); + AddStmt(const_cast(SC->getConditionExpr())); return; } case OpenACCClauseKind::NumGangs: { @@ -7857,13 +7857,13 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) { case OpenACCClauseKind::NumWorkers: { const auto *NWC = cast(C); writeSourceLocation(NWC->getLParenLoc()); - writeStmtRef(NWC->getIntExpr()); + AddStmt(const_cast(NWC->getIntExpr())); return; } case OpenACCClauseKind::VectorLength: { const auto *NWC = cast(C); writeSourceLocation(NWC->getLParenLoc()); - writeStmtRef(NWC->getIntExpr()); + AddStmt(const_cast(NWC->getIntExpr())); return; } case OpenACCClauseKind::Private: { @@ -7942,15 +7942,15 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) { writeSourceLocation(AC->getLParenLoc()); writeBool(AC->hasIntExpr()); if (AC->hasIntExpr()) - writeStmtRef(AC->getIntExpr()); + AddStmt(const_cast(AC->getIntExpr())); return; } case OpenACCClauseKind::Wait: { const auto *WC = cast(C); writeSourceLocation(WC->getLParenLoc()); writeBool(WC->getDevNumExpr()); - if (const Expr *DNE = WC->getDevNumExpr()) - writeStmtRef(DNE); + if (Expr *DNE = WC->getDevNumExpr()) + 
AddStmt(DNE); writeSourceLocation(WC->getQueuesLoc()); writeOpenACCIntExprList(WC->getQueueIdExprs()); From 060b3023e198d197b47c652f19af5f7dea3a22cc Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 28 May 2024 14:49:57 -0700 Subject: [PATCH 021/230] [RISCV] Move TRUNCATE_VECTOR_VL combine into a helper function. NFC (#93574) I plan to add other combines on TRUNCATE_VECTOR_VL. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 103 ++++++++++---------- 1 file changed, 53 insertions(+), 50 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index c826892c1668ec..5fc613c1b2a140 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -16087,6 +16087,57 @@ static bool matchIndexAsWiderOp(EVT VT, SDValue Index, SDValue Mask, return true; } +static SDValue combineTruncOfSraSext(SDNode *N, SelectionDAG &DAG) { + // trunc (sra sext (X), zext (Y)) -> sra (X, smin (Y, scalarsize(Y) - 1)) + // This would be benefit for the cases where X and Y are both the same value + // type of low precision vectors. Since the truncate would be lowered into + // n-levels TRUNCATE_VECTOR_VL to satisfy RVV's SEW*2->SEW truncate + // restriction, such pattern would be expanded into a series of "vsetvli" + // and "vnsrl" instructions later to reach this point. + auto IsTruncNode = [](SDValue V) { + if (V.getOpcode() != RISCVISD::TRUNCATE_VECTOR_VL) + return false; + SDValue VL = V.getOperand(2); + auto *C = dyn_cast(VL); + // Assume all TRUNCATE_VECTOR_VL nodes use VLMAX for VMSET_VL operand + bool IsVLMAXForVMSET = (C && C->isAllOnes()) || + (isa(VL) && + cast(VL)->getReg() == RISCV::X0); + return V.getOperand(1).getOpcode() == RISCVISD::VMSET_VL && IsVLMAXForVMSET; + }; + + SDValue Op = N->getOperand(0); + + // We need to first find the inner level of TRUNCATE_VECTOR_VL node + // to distinguish such pattern. 
+ while (IsTruncNode(Op)) { + if (!Op.hasOneUse()) + return SDValue(); + Op = Op.getOperand(0); + } + + if (Op.getOpcode() != ISD::SRA || !Op.hasOneUse()) + return SDValue(); + + SDValue N0 = Op.getOperand(0); + SDValue N1 = Op.getOperand(1); + if (N0.getOpcode() != ISD::SIGN_EXTEND || !N0.hasOneUse() || + N1.getOpcode() != ISD::ZERO_EXTEND || !N1.hasOneUse()) + return SDValue(); + + SDValue N00 = N0.getOperand(0); + SDValue N10 = N1.getOperand(0); + if (!N00.getValueType().isVector() || + N00.getValueType() != N10.getValueType() || + N->getValueType(0) != N10.getValueType()) + return SDValue(); + + unsigned MaxShAmt = N10.getValueType().getScalarSizeInBits() - 1; + SDValue SMin = + DAG.getNode(ISD::SMIN, SDLoc(N1), N->getValueType(0), N10, + DAG.getConstant(MaxShAmt, SDLoc(N1), N->getValueType(0))); + return DAG.getNode(ISD::SRA, SDLoc(N), N->getValueType(0), N00, SMin); +} SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { @@ -16304,56 +16355,8 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, } } return SDValue(); - case RISCVISD::TRUNCATE_VECTOR_VL: { - // trunc (sra sext (X), zext (Y)) -> sra (X, smin (Y, scalarsize(Y) - 1)) - // This would be benefit for the cases where X and Y are both the same value - // type of low precision vectors. Since the truncate would be lowered into - // n-levels TRUNCATE_VECTOR_VL to satisfy RVV's SEW*2->SEW truncate - // restriction, such pattern would be expanded into a series of "vsetvli" - // and "vnsrl" instructions later to reach this point. 
- auto IsTruncNode = [](SDValue V) { - if (V.getOpcode() != RISCVISD::TRUNCATE_VECTOR_VL) - return false; - SDValue VL = V.getOperand(2); - auto *C = dyn_cast(VL); - // Assume all TRUNCATE_VECTOR_VL nodes use VLMAX for VMSET_VL operand - bool IsVLMAXForVMSET = (C && C->isAllOnes()) || - (isa(VL) && - cast(VL)->getReg() == RISCV::X0); - return V.getOperand(1).getOpcode() == RISCVISD::VMSET_VL && - IsVLMAXForVMSET; - }; - - SDValue Op = N->getOperand(0); - - // We need to first find the inner level of TRUNCATE_VECTOR_VL node - // to distinguish such pattern. - while (IsTruncNode(Op)) { - if (!Op.hasOneUse()) - return SDValue(); - Op = Op.getOperand(0); - } - - if (Op.getOpcode() == ISD::SRA && Op.hasOneUse()) { - SDValue N0 = Op.getOperand(0); - SDValue N1 = Op.getOperand(1); - if (N0.getOpcode() == ISD::SIGN_EXTEND && N0.hasOneUse() && - N1.getOpcode() == ISD::ZERO_EXTEND && N1.hasOneUse()) { - SDValue N00 = N0.getOperand(0); - SDValue N10 = N1.getOperand(0); - if (N00.getValueType().isVector() && - N00.getValueType() == N10.getValueType() && - N->getValueType(0) == N10.getValueType()) { - unsigned MaxShAmt = N10.getValueType().getScalarSizeInBits() - 1; - SDValue SMin = DAG.getNode( - ISD::SMIN, SDLoc(N1), N->getValueType(0), N10, - DAG.getConstant(MaxShAmt, SDLoc(N1), N->getValueType(0))); - return DAG.getNode(ISD::SRA, SDLoc(N), N->getValueType(0), N00, SMin); - } - } - } - break; - } + case RISCVISD::TRUNCATE_VECTOR_VL: + return combineTruncOfSraSext(N, DAG); case ISD::TRUNCATE: return performTRUNCATECombine(N, DAG, Subtarget); case ISD::SELECT: From 00bd2fa1982f3114536323209fffad909463effc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Tue, 28 May 2024 14:57:13 -0700 Subject: [PATCH 022/230] [flang][cuda] Add bind c to cudadevice procedures (#92822) This patch adds bind c names to functions and subroutines in 
cudadevice so they can be lowered and not hit the intrinsic procedure TODOs. --- flang/module/cudadevice.f90 | 16 +++++----- flang/test/Lower/CUDA/cuda-device-proc.cuf | 36 ++++++++++++++++++++++ 2 files changed, 44 insertions(+), 8 deletions(-) create mode 100644 flang/test/Lower/CUDA/cuda-device-proc.cuf diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index f34820dd10792a..0224ecfdde7c60 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -18,34 +18,34 @@ module cudadevice ! Synchronization Functions interface - attributes(device) subroutine syncthreads() + attributes(device) subroutine syncthreads() bind(c, name='__syncthreads') end subroutine end interface public :: syncthreads interface - attributes(device) integer function syncthreads_and(value) + attributes(device) integer function syncthreads_and(value) bind(c, name='__syncthreads_and') integer :: value end function end interface public :: syncthreads_and interface - attributes(device) integer function syncthreads_count(value) + attributes(device) integer function syncthreads_count(value) bind(c, name='__syncthreads_count') integer :: value end function end interface public :: syncthreads_count interface - attributes(device) integer function syncthreads_or(value) + attributes(device) integer function syncthreads_or(value) bind(c, name='__syncthreads_or') integer :: value end function end interface public :: syncthreads_or interface - attributes(device) subroutine syncwarp(mask) + attributes(device) subroutine syncwarp(mask) bind(c, name='__syncwarp') integer :: mask end subroutine end interface @@ -54,19 +54,19 @@ attributes(device) subroutine syncwarp(mask) ! 
Memory Fences interface - attributes(device) subroutine threadfence() + attributes(device) subroutine threadfence() bind(c, name='__threadfence') end subroutine end interface public :: threadfence interface - attributes(device) subroutine threadfence_block() + attributes(device) subroutine threadfence_block() bind(c, name='__threadfence_block') end subroutine end interface public :: threadfence_block interface - attributes(device) subroutine threadfence_system() + attributes(device) subroutine threadfence_system() bind(c, name='__threadfence_system') end subroutine end interface public :: threadfence_system diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf new file mode 100644 index 00000000000000..0c71ea6efcd632 --- /dev/null +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -0,0 +1,36 @@ +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s + +! Test CUDA Fortran procedures available in cudadevice module + +attributes(global) subroutine devsub() + implicit none + integer :: ret + + call syncthreads() + call syncwarp(1) + call threadfence() + call threadfence_block() + call threadfence_system() + ret = syncthreads_and(1) + ret = syncthreads_count(1) + ret = syncthreads_or(1) +end + +! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc} +! CHECK: fir.call @__syncthreads() +! CHECK: fir.call @__syncwarp(%{{.*}}) fastmath : (!fir.ref) -> () +! CHECK: fir.call @__threadfence() +! CHECK: fir.call @__threadfence_block() +! CHECK: fir.call @__threadfence_system() +! CHECK: %{{.*}} = fir.call @__syncthreads_and(%{{.*}}) fastmath : (!fir.ref) -> i32 +! CHECK: %{{.*}} = fir.call @__syncthreads_count(%{{.*}}) fastmath : (!fir.ref) -> i32 +! CHECK: %{{.*}} = fir.call @__syncthreads_or(%{{.*}}) fastmath : (!fir.ref) -> i32 + +! CHECK: func.func private @__syncthreads() attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__syncthreads"} +! 
CHECK: func.func private @__syncwarp(!fir.ref {cuf.data_attr = #cuf.cuda}) attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__syncwarp"} +! CHECK: func.func private @__threadfence() attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__threadfence"} +! CHECK: func.func private @__threadfence_block() attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__threadfence_block"} +! CHECK: func.func private @__threadfence_system() attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__threadfence_system"} +! CHECK: func.func private @__syncthreads_and(!fir.ref {cuf.data_attr = #cuf.cuda}) -> i32 attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__syncthreads_and"} +! CHECK: func.func private @__syncthreads_count(!fir.ref {cuf.data_attr = #cuf.cuda}) -> i32 attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__syncthreads_count"} +! CHECK: func.func private @__syncthreads_or(!fir.ref {cuf.data_attr = #cuf.cuda}) -> i32 attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__syncthreads_or"} From 2d00c6fe06b6d709b4ab3d6b253df304c04e0c1f Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 28 May 2024 15:05:23 -0700 Subject: [PATCH 023/230] [RISCV] Add a rematerializable pseudo instruction for LUI+ADDI for global addresses. (#93352) This allows register allocation to rematerialize these instead of spilling and reloading. We need to make it a single instruction due to limitations in rematerialization. This pseudo is expanded to an LUI+ADDI pair between regalloc and post RA scheduling. This improves the dynamic instruction count on 531.deepsjeng_r from spec2017 by 3.2% for the train dataset. 500.perlbench and 502.gcc see a 1% improvement. There are a couple of regressions, but they are 0.1% or smaller.
AArch64 has similar pseudo instructions like MOVaddr --- llvm/lib/Target/RISCV/RISCVInstrInfo.td | 20 ++ .../lib/Target/RISCV/RISCVMergeBaseOffset.cpp | 35 ++- .../RISCV/RISCVPostRAExpandPseudoInsts.cpp | 23 ++ llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll | 22 +- .../CodeGen/RISCV/ctz_zero_return_test.ll | 8 +- .../early-clobber-tied-def-subreg-liveness.ll | 14 +- .../test/CodeGen/RISCV/fold-addi-loadstore.ll | 4 +- llvm/test/CodeGen/RISCV/rv32xtheadbb.ll | 4 +- llvm/test/CodeGen/RISCV/rv32zbb.ll | 4 +- .../CodeGen/RISCV/rvv/active_lane_mask.ll | 40 +-- .../CodeGen/RISCV/rvv/fixed-vectors-int.ll | 4 +- .../rvv/fixed-vectors-interleaved-access.ll | 275 +++++++++--------- .../RISCV/rvv/fixed-vectors-mask-buildvec.ll | 20 +- .../RISCV/rvv/fixed-vectors-masked-gather.ll | 16 +- .../rvv/fixed-vectors-shuffle-reverse.ll | 80 ++--- .../RISCV/rvv/fixed-vectors-stepvector.ll | 10 +- .../test/CodeGen/RISCV/rvv/shuffle-reverse.ll | 50 ++-- llvm/test/CodeGen/RISCV/tail-calls.ll | 8 +- llvm/test/CodeGen/RISCV/unroll-loop-cse.ll | 32 +- 19 files changed, 358 insertions(+), 311 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index ce50fe6e2cbb02..a1b078910e29c9 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -1311,6 +1311,26 @@ def : Pat<(FrameAddrRegImm (iPTR GPR:$rs1), simm12:$imm12), /// HI and ADD_LO address nodes. +// Pseudo for a rematerializable LUI+ADDI sequence for loading an address. +// It will be expanded after register allocation. +// FIXME: The scheduling information does not reflect the multiple instructions. 
+let Size = 8, isReMaterializable = 1 in +def PseudoMovAddr : Pseudo<(outs GPR:$dst), (ins uimm20_lui:$hi, simm12:$lo), []>, + Sched<[WriteIALU]>; + +def riscv_hi_oneuse : unop_oneuse; +def addr_hi_lo : PatFrag<(ops node:$hi, node:$lo), + (riscv_add_lo (riscv_hi_oneuse node:$hi), node:$lo)>; + +def : Pat<(addr_hi_lo tglobaladdr:$hi, tglobaladdr:$lo), + (PseudoMovAddr tglobaladdr:$hi, tglobaladdr:$lo)>; +def : Pat<(addr_hi_lo tblockaddress:$hi, tblockaddress:$lo), + (PseudoMovAddr tblockaddress:$hi, tblockaddress:$lo)>; +def : Pat<(addr_hi_lo tjumptable:$hi, tjumptable:$lo), + (PseudoMovAddr tjumptable:$hi, tjumptable:$lo)>; +def : Pat<(addr_hi_lo tconstpool:$hi, tconstpool:$lo), + (PseudoMovAddr tconstpool:$hi, tconstpool:$lo)>; + def : Pat<(riscv_hi tglobaladdr:$in), (LUI tglobaladdr:$in)>; def : Pat<(riscv_hi tblockaddress:$in), (LUI tblockaddress:$in)>; def : Pat<(riscv_hi tjumptable:$in), (LUI tjumptable:$in)>; diff --git a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp index 410989177a8b9c..fecc83a821f420 100644 --- a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp +++ b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp @@ -84,7 +84,8 @@ INITIALIZE_PASS(RISCVMergeBaseOffsetOpt, DEBUG_TYPE, // 3) The offset value in the Global Address or Constant Pool is 0. 
bool RISCVMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi, MachineInstr *&Lo) { - if (Hi.getOpcode() != RISCV::LUI && Hi.getOpcode() != RISCV::AUIPC) + if (Hi.getOpcode() != RISCV::LUI && Hi.getOpcode() != RISCV::AUIPC && + Hi.getOpcode() != RISCV::PseudoMovAddr) return false; const MachineOperand &HiOp1 = Hi.getOperand(1); @@ -97,16 +98,22 @@ bool RISCVMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi, HiOp1.getOffset() != 0) return false; - Register HiDestReg = Hi.getOperand(0).getReg(); - if (!MRI->hasOneUse(HiDestReg)) - return false; + if (Hi.getOpcode() == RISCV::PseudoMovAddr) { + // Most of the code should handle it correctly without modification by + // setting Lo and Hi both point to PseudoMovAddr + Lo = &Hi; + } else { + Register HiDestReg = Hi.getOperand(0).getReg(); + if (!MRI->hasOneUse(HiDestReg)) + return false; - Lo = &*MRI->use_instr_begin(HiDestReg); - if (Lo->getOpcode() != RISCV::ADDI) - return false; + Lo = &*MRI->use_instr_begin(HiDestReg); + if (Lo->getOpcode() != RISCV::ADDI) + return false; + } const MachineOperand &LoOp2 = Lo->getOperand(2); - if (Hi.getOpcode() == RISCV::LUI) { + if (Hi.getOpcode() == RISCV::LUI || Hi.getOpcode() == RISCV::PseudoMovAddr) { if (LoOp2.getTargetFlags() != RISCVII::MO_LO || !(LoOp2.isGlobal() || LoOp2.isCPI() || LoOp2.isBlockAddress()) || LoOp2.getOffset() != 0) @@ -466,6 +473,13 @@ bool RISCVMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi, Hi.getOperand(1).setOffset(NewOffset); MachineOperand &ImmOp = Lo.getOperand(2); + // Expand PseudoMovAddr into LUI + if (Hi.getOpcode() == RISCV::PseudoMovAddr) { + auto *TII = ST->getInstrInfo(); + Hi.setDesc(TII->get(RISCV::LUI)); + Hi.removeOperand(2); + } + if (Hi.getOpcode() != RISCV::AUIPC) ImmOp.setOffset(NewOffset); @@ -501,6 +515,11 @@ bool RISCVMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi, } } + // Prevent Lo (originally PseudoMovAddr, which is also pointed by Hi) from + // being erased + if (&Lo == &Hi) + return true; + 
MRI->replaceRegWith(Lo.getOperand(0).getReg(), Hi.getOperand(0).getReg()); Lo.eraseFromParent(); return true; diff --git a/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp index 52f2ce27164d6e..b7b0c47c084c64 100644 --- a/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp @@ -44,6 +44,7 @@ class RISCVPostRAExpandPseudo : public MachineFunctionPass { bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI); bool expandMovImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); + bool expandMovAddr(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); }; char RISCVPostRAExpandPseudo::ID = 0; @@ -75,6 +76,8 @@ bool RISCVPostRAExpandPseudo::expandMI(MachineBasicBlock &MBB, switch (MBBI->getOpcode()) { case RISCV::PseudoMovImm: return expandMovImm(MBB, MBBI); + case RISCV::PseudoMovAddr: + return expandMovAddr(MBB, MBBI); default: return false; } @@ -101,6 +104,26 @@ bool RISCVPostRAExpandPseudo::expandMovImm(MachineBasicBlock &MBB, return true; } +bool RISCVPostRAExpandPseudo::expandMovAddr(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + DebugLoc DL = MBBI->getDebugLoc(); + + Register DstReg = MBBI->getOperand(0).getReg(); + bool DstIsDead = MBBI->getOperand(0).isDead(); + bool Renamable = MBBI->getOperand(0).isRenamable(); + + BuildMI(MBB, MBBI, DL, TII->get(RISCV::LUI)) + .addReg(DstReg, RegState::Define | getRenamableRegState(Renamable)) + .add(MBBI->getOperand(1)); + BuildMI(MBB, MBBI, DL, TII->get(RISCV::ADDI)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead) | + getRenamableRegState(Renamable)) + .addReg(DstReg, RegState::Kill | getRenamableRegState(Renamable)) + .add(MBBI->getOperand(2)); + MBBI->eraseFromParent(); + return true; +} + } // end of anonymous namespace INITIALIZE_PASS(RISCVPostRAExpandPseudo, "riscv-expand-pseudolisimm32", diff 
--git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll index 549d531e829ea5..a90c244437a033 100644 --- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll +++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll @@ -383,8 +383,8 @@ define i64 @test_cttz_i64(i64 %a) nounwind { ; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __mulsi3 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lui a0, %hi(.LCPI3_0) -; RV32I-NEXT: addi s4, a0, %lo(.LCPI3_0) +; RV32I-NEXT: lui s4, %hi(.LCPI3_0) +; RV32I-NEXT: addi s4, s4, %lo(.LCPI3_0) ; RV32I-NEXT: neg a0, s2 ; RV32I-NEXT: and a0, s2, a0 ; RV32I-NEXT: mv a1, s3 @@ -442,9 +442,9 @@ define i64 @test_cttz_i64(i64 %a) nounwind { ; RV32M-LABEL: test_cttz_i64: ; RV32M: # %bb.0: ; RV32M-NEXT: lui a2, 30667 -; RV32M-NEXT: addi a2, a2, 1329 -; RV32M-NEXT: lui a3, %hi(.LCPI3_0) -; RV32M-NEXT: addi a3, a3, %lo(.LCPI3_0) +; RV32M-NEXT: addi a3, a2, 1329 +; RV32M-NEXT: lui a2, %hi(.LCPI3_0) +; RV32M-NEXT: addi a2, a2, %lo(.LCPI3_0) ; RV32M-NEXT: bnez a1, .LBB3_3 ; RV32M-NEXT: # %bb.1: ; RV32M-NEXT: li a1, 32 @@ -452,18 +452,18 @@ define i64 @test_cttz_i64(i64 %a) nounwind { ; RV32M-NEXT: .LBB3_2: ; RV32M-NEXT: neg a1, a0 ; RV32M-NEXT: and a0, a0, a1 -; RV32M-NEXT: mul a0, a0, a2 +; RV32M-NEXT: mul a0, a0, a3 ; RV32M-NEXT: srli a0, a0, 27 -; RV32M-NEXT: add a0, a3, a0 +; RV32M-NEXT: add a0, a2, a0 ; RV32M-NEXT: lbu a0, 0(a0) ; RV32M-NEXT: li a1, 0 ; RV32M-NEXT: ret ; RV32M-NEXT: .LBB3_3: ; RV32M-NEXT: neg a4, a1 ; RV32M-NEXT: and a1, a1, a4 -; RV32M-NEXT: mul a1, a1, a2 +; RV32M-NEXT: mul a1, a1, a3 ; RV32M-NEXT: srli a1, a1, 27 -; RV32M-NEXT: add a1, a3, a1 +; RV32M-NEXT: add a1, a2, a1 ; RV32M-NEXT: lbu a1, 0(a1) ; RV32M-NEXT: bnez a0, .LBB3_2 ; RV32M-NEXT: .LBB3_4: @@ -814,8 +814,8 @@ define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind { ; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __mulsi3 ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lui a0, %hi(.LCPI7_0) -; RV32I-NEXT: addi s4, a0, %lo(.LCPI7_0) +; RV32I-NEXT: lui s4, %hi(.LCPI7_0) 
+; RV32I-NEXT: addi s4, s4, %lo(.LCPI7_0) ; RV32I-NEXT: neg a0, s1 ; RV32I-NEXT: and a0, s1, a0 ; RV32I-NEXT: mv a1, s3 diff --git a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll index 9ae30e646fdbf7..fe6e20d852d590 100644 --- a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll +++ b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll @@ -48,8 +48,8 @@ define signext i32 @ctz_dereferencing_pointer(ptr %b) nounwind { ; RV32I-NEXT: mv a1, s1 ; RV32I-NEXT: call __mulsi3 ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lui a0, %hi(.LCPI0_0) -; RV32I-NEXT: addi s3, a0, %lo(.LCPI0_0) +; RV32I-NEXT: lui s3, %hi(.LCPI0_0) +; RV32I-NEXT: addi s3, s3, %lo(.LCPI0_0) ; RV32I-NEXT: neg a0, s4 ; RV32I-NEXT: and a0, s4, a0 ; RV32I-NEXT: mv a1, s1 @@ -511,8 +511,8 @@ define signext i32 @ctz4(i64 %b) nounwind { ; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __mulsi3 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lui a0, %hi(.LCPI6_0) -; RV32I-NEXT: addi s4, a0, %lo(.LCPI6_0) +; RV32I-NEXT: lui s4, %hi(.LCPI6_0) +; RV32I-NEXT: addi s4, s4, %lo(.LCPI6_0) ; RV32I-NEXT: neg a0, s2 ; RV32I-NEXT: and a0, s2, a0 ; RV32I-NEXT: mv a1, s3 diff --git a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll index eb6ac985287a10..478d2eae9dca2c 100644 --- a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll +++ b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll @@ -24,31 +24,31 @@ define void @_Z3foov() { ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_49) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_49) ; CHECK-NEXT: vsetivli zero, 2, e16, m2, ta, ma -; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_48) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_48) -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v10, (a0) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: 
add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vs1r.v v10, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_46) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_46) -; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: vle16.v v10, (a0) ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_45) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_45) -; CHECK-NEXT: vle16.v v14, (a0) +; CHECK-NEXT: vle16.v v12, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vs2r.v v12, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vs2r.v v14, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: vs2r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_40) diff --git a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll index 3c2e84689c979c..62b1549a5d58ad 100644 --- a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll +++ b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll @@ -389,8 +389,8 @@ define dso_local i32 @load_ga() local_unnamed_addr #0 { define dso_local i64 @load_ga_8() nounwind { ; RV32I-LABEL: load_ga_8: ; RV32I: # %bb.0: # %entry -; RV32I-NEXT: lui a0, %hi(ga_8) -; RV32I-NEXT: addi a1, a0, %lo(ga_8) +; RV32I-NEXT: lui a1, %hi(ga_8) +; RV32I-NEXT: addi a1, a1, %lo(ga_8) ; RV32I-NEXT: lw a0, 8(a1) ; RV32I-NEXT: lw a1, 12(a1) ; RV32I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll index b45ab135fa1c7c..197366e7e05fe8 100644 --- a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll +++ b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll @@ -209,8 
+209,8 @@ define i64 @cttz_i64(i64 %a) nounwind { ; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __mulsi3 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lui a0, %hi(.LCPI3_0) -; RV32I-NEXT: addi s4, a0, %lo(.LCPI3_0) +; RV32I-NEXT: lui s4, %hi(.LCPI3_0) +; RV32I-NEXT: addi s4, s4, %lo(.LCPI3_0) ; RV32I-NEXT: neg a0, s2 ; RV32I-NEXT: and a0, s2, a0 ; RV32I-NEXT: mv a1, s3 diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll index 7e6c3f9c87d277..f25aa0de89da88 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll @@ -199,8 +199,8 @@ define i64 @cttz_i64(i64 %a) nounwind { ; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __mulsi3 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lui a0, %hi(.LCPI3_0) -; RV32I-NEXT: addi s4, a0, %lo(.LCPI3_0) +; RV32I-NEXT: lui s4, %hi(.LCPI3_0) +; RV32I-NEXT: addi s4, s4, %lo(.LCPI3_0) ; RV32I-NEXT: neg a0, s2 ; RV32I-NEXT: and a0, s2, a0 ; RV32I-NEXT: mv a1, s3 diff --git a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll index 9cb3991f31f94d..08b310213d16e1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll @@ -126,28 +126,28 @@ define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) { ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vid.v v8 ; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v0, v8, a2 ; CHECK-NEXT: lui a0, %hi(.LCPI9_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_0) -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vmsltu.vx v0, v8, a2 ; CHECK-NEXT: lui a0, %hi(.LCPI9_1) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_1) -; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsext.vf8 v24, v16 +; CHECK-NEXT: vsaddu.vx v16, v24, a1 +; CHECK-NEXT: vmsltu.vx v9, v16, a2 ; CHECK-NEXT: vsext.vf8 v16, v8 ; CHECK-NEXT: vsaddu.vx v16, v16, a1 -; CHECK-NEXT: vmsltu.vx v8, v16, a2 -; CHECK-NEXT: vsext.vf8 v16, v9 -; 
CHECK-NEXT: vsaddu.vx v16, v16, a1 ; CHECK-NEXT: lui a0, %hi(.LCPI9_2) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_2) -; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vmsltu.vx v10, v16, a2 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v0, v8, 2 +; CHECK-NEXT: vslideup.vi v0, v9, 2 ; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma ; CHECK-NEXT: vslideup.vi v0, v10, 4 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vsext.vf8 v16, v9 +; CHECK-NEXT: vsext.vf8 v16, v8 ; CHECK-NEXT: vsaddu.vx v8, v16, a1 ; CHECK-NEXT: vmsltu.vx v16, v8, a2 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma @@ -169,13 +169,13 @@ define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) { ; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: vsext.vf8 v16, v8 ; CHECK-NEXT: vsaddu.vx v16, v16, a1 -; CHECK-NEXT: vmsltu.vx v10, v16, a2 +; CHECK-NEXT: vmsltu.vx v8, v16, a2 ; CHECK-NEXT: vsext.vf8 v16, v9 ; CHECK-NEXT: vsaddu.vx v16, v16, a1 -; CHECK-NEXT: vmsltu.vx v8, v16, a2 ; CHECK-NEXT: lui a0, %hi(.LCPI10_2) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_2) ; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vmsltu.vx v10, v16, a2 ; CHECK-NEXT: lui a0, %hi(.LCPI10_3) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_3) ; CHECK-NEXT: vle8.v v11, (a0) @@ -187,10 +187,10 @@ define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) { ; CHECK-NEXT: vmsltu.vx v11, v16, a2 ; CHECK-NEXT: vid.v v16 ; CHECK-NEXT: vsaddu.vx v16, v16, a1 -; CHECK-NEXT: vmsltu.vx v0, v16, a2 ; CHECK-NEXT: lui a0, %hi(.LCPI10_4) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_4) ; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: vmsltu.vx v0, v16, a2 ; CHECK-NEXT: lui a0, %hi(.LCPI10_5) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_5) ; CHECK-NEXT: vle8.v v13, (a0) @@ -201,27 +201,27 @@ define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) { ; CHECK-NEXT: vsaddu.vx v16, v16, a1 ; CHECK-NEXT: vmsltu.vx v13, v16, a2 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v10, 2 +; CHECK-NEXT: 
vslideup.vi v10, v8, 2 ; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v9, 4 +; CHECK-NEXT: vslideup.vi v10, v9, 4 ; CHECK-NEXT: lui a0, %hi(.LCPI10_6) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_6) ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v11, 6 +; CHECK-NEXT: vslideup.vi v10, v11, 6 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma ; CHECK-NEXT: vslideup.vi v0, v12, 2 ; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma ; CHECK-NEXT: vslideup.vi v0, v13, 4 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vsext.vf8 v16, v9 +; CHECK-NEXT: vsext.vf8 v16, v8 ; CHECK-NEXT: vsaddu.vx v16, v16, a1 -; CHECK-NEXT: vmsltu.vx v9, v16, a2 +; CHECK-NEXT: vmsltu.vx v8, v16, a2 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vslideup.vi v0, v9, 6 +; CHECK-NEXT: vslideup.vi v0, v8, 6 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vslideup.vi v0, v8, 8 +; CHECK-NEXT: vslideup.vi v0, v10, 8 ; CHECK-NEXT: ret %mask = call <128 x i1> @llvm.get.active.lane.mask.v128i1.i64(i64 %index, i64 %tc) ret <128 x i1> %mask diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll index 79c36a629465d9..f4d7074c7f6b27 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll @@ -3459,6 +3459,8 @@ define void @mulhu_v4i64(ptr %x) { ; RV64-NEXT: lui a1, %hi(.LCPI184_0) ; RV64-NEXT: addi a1, a1, %lo(.LCPI184_0) ; RV64-NEXT: vle64.v v10, (a1) +; RV64-NEXT: vmulhu.vv v10, v8, v10 +; RV64-NEXT: vsub.vv v8, v8, v10 ; RV64-NEXT: li a1, -1 ; RV64-NEXT: slli a1, a1, 63 ; RV64-NEXT: vmv.s.x v12, a1 @@ -3466,8 +3468,6 @@ define void @mulhu_v4i64(ptr %x) { ; RV64-NEXT: vsetivli zero, 3, e64, m2, tu, ma ; RV64-NEXT: vslideup.vi v14, v12, 2 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; 
RV64-NEXT: vmulhu.vv v10, v8, v10 -; RV64-NEXT: vsub.vv v8, v8, v10 ; RV64-NEXT: vmulhu.vv v8, v8, v14 ; RV64-NEXT: vadd.vv v8, v8, v10 ; RV64-NEXT: lui a1, 12320 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 178a920169ad96..bc3e135a588a6f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -159,17 +159,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 82 +; RV32-NEXT: li a3, 80 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd2, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 82 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 80 * vlenb ; RV32-NEXT: addi a3, a1, 256 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v16, (a3) ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 57 -; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: slli a3, a3, 6 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill @@ -177,26 +176,26 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vslideup.vi v8, v16, 4 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 41 +; RV32-NEXT: li a5, 40 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: lui a4, 12 -; RV32-NEXT: vmv.s.x v1, a4 +; RV32-NEXT: vmv.s.x v0, a4 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV32-NEXT: vslidedown.vi 
v16, v16, 16 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a5, a4, 6 -; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: li a5, 56 +; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vmv1r.v v3, v0 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vslideup.vi v8, v16, 10, v0.t ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 45 +; RV32-NEXT: li a5, 44 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 @@ -206,8 +205,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: vle16.v v8, (a4) ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a5, a4, 5 -; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: slli a4, a4, 5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill @@ -216,21 +214,21 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: lui a5, 1 ; RV32-NEXT: vle16.v v8, (a4) ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a6, 25 +; RV32-NEXT: li a6, 24 ; RV32-NEXT: mul a4, a4, a6 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 73 +; RV32-NEXT: li a4, 72 ; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v24, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -238,27 +236,26 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a5, -64 ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: csrr a1, vlenb -; 
RV32-NEXT: li a3, 37 +; RV32-NEXT: li a3, 36 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v16, v8, v4 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 24 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v16, v24, v8, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 45 +; RV32-NEXT: li a3, 44 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -266,259 +263,257 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v8, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 45 +; RV32-NEXT: li a3, 44 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 57 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 6 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vslideup.vi v12, v8, 2 +; RV32-NEXT: vmv1r.v v8, v3 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 21 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs1r.v v1, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vs1r.v v3, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v3 ; RV32-NEXT: csrr a1, vlenb -; 
RV32-NEXT: slli a3, a1, 6 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 56 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vslideup.vi v12, v16, 8, v0.t -; RV32-NEXT: vmv.v.v v20, v12 ; RV32-NEXT: lui a1, %hi(.LCPI6_2) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_2) ; RV32-NEXT: lui a3, %hi(.LCPI6_3) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_3) -; RV32-NEXT: lui a4, %hi(.LCPI6_4) ; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; RV32-NEXT: vle16.v v4, (a1) -; RV32-NEXT: vle16.v v16, (a3) -; RV32-NEXT: addi a1, a4, %lo(.LCPI6_4) +; RV32-NEXT: vle16.v v0, (a1) +; RV32-NEXT: vle16.v v4, (a3) +; RV32-NEXT: lui a1, %hi(.LCPI6_4) +; RV32-NEXT: addi a1, a1, %lo(.LCPI6_4) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vle16.v v2, (a1) +; RV32-NEXT: vle16.v v10, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 73 +; RV32-NEXT: li a3, 72 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vrgatherei16.vv v24, v8, v4 +; RV32-NEXT: vrgatherei16.vv v24, v16, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 37 +; RV32-NEXT: li a3, 36 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v8, v16, v0.t +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v24, v16, v4, v0.t ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v20, v24 +; RV32-NEXT: vmv.v.v v12, 
v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 37 +; RV32-NEXT: li a3, 36 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 57 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 6 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vrgatherei16.vv v16, v24, v2 -; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vrgatherei16.vv v12, v24, v10 +; RV32-NEXT: vmv1r.v v0, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 6 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 56 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vslideup.vi v16, v8, 6, v0.t +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vslideup.vi v12, v24, 6, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_5) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_5) ; RV32-NEXT: lui a3, %hi(.LCPI6_6) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_6) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vle16.v v16, (a1) -; RV32-NEXT: vle16.v v4, (a3) -; RV32-NEXT: li a1, 960 -; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: vle16.v v12, (a1) +; RV32-NEXT: vle16.v v8, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 13 +; RV32-NEXT: li a3, 12 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; 
RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: li a1, 960 +; RV32-NEXT: vmv.s.x v8, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 73 +; RV32-NEXT: li a3, 72 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v16 +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v24, v0, v12 +; RV32-NEXT: vmv1r.v v3, v8 +; RV32-NEXT: vmv1r.v v0, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 +; RV32-NEXT: li a3, 12 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v4, v0.t +; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v24, v16, v8, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 24 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_7) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_7) ; RV32-NEXT: lui a3, %hi(.LCPI6_8) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_8) -; RV32-NEXT: lui a4, %hi(.LCPI6_9) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v8, (a1) -; RV32-NEXT: addi a1, a4, %lo(.LCPI6_9) +; RV32-NEXT: lui a1, %hi(.LCPI6_9) +; RV32-NEXT: addi a1, a1, %lo(.LCPI6_9) ; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; RV32-NEXT: vle16.v v24, (a3) -; RV32-NEXT: vle16.v v28, (a1) +; RV32-NEXT: vle16.v v4, (a3) +; RV32-NEXT: vle16.v v12, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 57 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vs4r.v v12, 
(a1) # Unknown-size Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vrgatherei16.vv v4, v0, v8 +; RV32-NEXT: vrgatherei16.vv v12, v24, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 21 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 6 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 56 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vslideup.vi v4, v8, 4, v0.t +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vmv4r.v v24, v16 +; RV32-NEXT: vslideup.vi v12, v16, 4, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 21 +; RV32-NEXT: li a3, 12 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 73 +; RV32-NEXT: li a3, 72 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vrgatherei16.vv v8, v0, v24 +; RV32-NEXT: vrgatherei16.vv v8, v16, v4 +; RV32-NEXT: vmv1r.v v0, v3 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 13 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; 
RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v8, v16, v28, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 13 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_10) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_10) ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vle16.v v8, (a1) +; RV32-NEXT: vle16.v v12, (a1) ; RV32-NEXT: lui a1, 15 ; RV32-NEXT: vmv.s.x v3, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 57 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 6 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vslideup.vi v12, v16, 6 +; RV32-NEXT: vslideup.vi v8, v16, 6 ; RV32-NEXT: vmv1r.v v0, v3 +; RV32-NEXT: vrgatherei16.vv v8, v24, v12, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 6 -; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 57 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_11) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_11) ; RV32-NEXT: lui a3, %hi(.LCPI6_12) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_12) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vle16.v v8, (a1) -; RV32-NEXT: vle16.v v12, (a3) +; RV32-NEXT: vle16.v v24, (a1) +; RV32-NEXT: vle16.v v4, (a3) ; RV32-NEXT: li a1, 1008 ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: 
csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 73 +; RV32-NEXT: li a3, 72 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v16, v8 +; RV32-NEXT: vrgatherei16.vv v8, v16, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v16, v12, v0.t +; RV32-NEXT: vrgatherei16.vv v8, v16, v4, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 2 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 6 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_13) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_13) ; RV32-NEXT: lui a3, %hi(.LCPI6_14) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_14) -; RV32-NEXT: lui a4, %hi(.LCPI6_15) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v20, (a1) -; RV32-NEXT: addi a1, a4, %lo(.LCPI6_15) +; RV32-NEXT: lui a1, %hi(.LCPI6_15) +; RV32-NEXT: addi a1, a1, %lo(.LCPI6_15) ; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; RV32-NEXT: vle16.v v24, (a3) ; RV32-NEXT: vle16.v v8, (a1) @@ -526,27 +521,26 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vmv1r.v v0, v3 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 41 +; RV32-NEXT: li a3, 40 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded 
Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 6 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 56 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vrgatherei16.vv v16, v8, v20, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 24 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -554,7 +548,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v20, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 73 +; RV32-NEXT: li a3, 72 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -562,12 +556,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: vrgatherei16.vv v8, v0, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 49 +; RV32-NEXT: li a2, 48 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -576,31 +570,28 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v8, v24, v4, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 21 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: 
addi a1, a1, 16 -; RV32-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 13 +; RV32-NEXT: li a2, 12 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v24, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 57 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 6 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a2, a1, 2 -; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vmv.v.v v28, v0 ; RV32-NEXT: vmv.v.v v16, v8 ; RV32-NEXT: addi a1, a0, 320 @@ -614,21 +605,21 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vse32.v v20, (a1) ; RV32-NEXT: addi a1, a0, 64 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 37 +; RV32-NEXT: li a3, 36 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 45 +; RV32-NEXT: li a2, 44 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a0) ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 82 +; RV32-NEXT: li a1, 80 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 diff --git 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll index 17483151869365..7608349ef7aeff 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll @@ -549,20 +549,20 @@ define <128 x i1> @buildvec_mask_v128i1() { define <128 x i1> @buildvec_mask_optsize_v128i1() optsize { ; CHECK-LABEL: buildvec_mask_optsize_v128i1: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI21_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI21_0) -; CHECK-NEXT: li a1, 128 -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma -; CHECK-NEXT: vlm.v v0, (a0) +; CHECK-NEXT: li a0, 128 +; CHECK-NEXT: lui a1, %hi(.LCPI21_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI21_0) +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v0, (a1) ; CHECK-NEXT: ret ; ; ZVE32F-LABEL: buildvec_mask_optsize_v128i1: ; ZVE32F: # %bb.0: -; ZVE32F-NEXT: lui a0, %hi(.LCPI21_0) -; ZVE32F-NEXT: addi a0, a0, %lo(.LCPI21_0) -; ZVE32F-NEXT: li a1, 128 -; ZVE32F-NEXT: vsetvli zero, a1, e8, m8, ta, ma -; ZVE32F-NEXT: vlm.v v0, (a0) +; ZVE32F-NEXT: li a0, 128 +; ZVE32F-NEXT: lui a1, %hi(.LCPI21_0) +; ZVE32F-NEXT: addi a1, a1, %lo(.LCPI21_0) +; ZVE32F-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; ZVE32F-NEXT: vlm.v v0, (a1) ; ZVE32F-NEXT: ret ret <128 x i1> } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index db0969c85a8e24..69341981288b91 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -13327,22 +13327,22 @@ define <8 x i16> @mgather_shuffle_rotate(ptr %base) { define <8 x i16> @mgather_shuffle_vrgather(ptr %base) { ; RV32-LABEL: mgather_shuffle_vrgather: ; RV32: # %bb.0: +; RV32-NEXT: lui a1, %hi(.LCPI119_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI119_0) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: 
vle16.v v9, (a0) -; RV32-NEXT: lui a0, %hi(.LCPI119_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI119_0) +; RV32-NEXT: vle16.v v9, (a1) ; RV32-NEXT: vle16.v v10, (a0) -; RV32-NEXT: vrgather.vv v8, v9, v10 +; RV32-NEXT: vrgather.vv v8, v10, v9 ; RV32-NEXT: ret ; ; RV64V-LABEL: mgather_shuffle_vrgather: ; RV64V: # %bb.0: +; RV64V-NEXT: lui a1, %hi(.LCPI119_0) +; RV64V-NEXT: addi a1, a1, %lo(.LCPI119_0) ; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV64V-NEXT: vle16.v v9, (a0) -; RV64V-NEXT: lui a0, %hi(.LCPI119_0) -; RV64V-NEXT: addi a0, a0, %lo(.LCPI119_0) +; RV64V-NEXT: vle16.v v9, (a1) ; RV64V-NEXT: vle16.v v10, (a0) -; RV64V-NEXT: vrgather.vv v8, v9, v10 +; RV64V-NEXT: vrgather.vv v8, v10, v9 ; RV64V-NEXT: ret ; ; RV64ZVE32F-LABEL: mgather_shuffle_vrgather: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll index d70ed2fb0e2665..4b1f0beb487008 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll @@ -228,11 +228,11 @@ define <16 x i8> @reverse_v16i8(<16 x i8> %a) { define <32 x i8> @reverse_v32i8(<32 x i8> %a) { ; CHECK-LABEL: reverse_v32i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI12_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a1, %hi(.LCPI12_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI12_0) +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vle8.v v12, (a1) ; CHECK-NEXT: vrgather.vv v10, v8, v12 ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret @@ -243,11 +243,11 @@ define <32 x i8> @reverse_v32i8(<32 x i8> %a) { define <64 x i8> @reverse_v64i8(<64 x i8> %a) { ; CHECK-LABEL: reverse_v64i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI13_0) -; CHECK-NEXT: li a1, 64 -; CHECK-NEXT: 
vsetvli zero, a1, e8, m4, ta, ma -; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: lui a1, %hi(.LCPI13_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI13_0) +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vle8.v v16, (a1) ; CHECK-NEXT: vrgather.vv v12, v8, v16 ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret @@ -323,11 +323,11 @@ define <16 x i16> @reverse_v16i16(<16 x i16> %a) { define <32 x i16> @reverse_v32i16(<32 x i16> %a) { ; CHECK-LABEL: reverse_v32i16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI19_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI19_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a1, %hi(.LCPI19_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI19_0) +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vle8.v v12, (a1) ; CHECK-NEXT: vsext.vf2 v16, v12 ; CHECK-NEXT: vrgather.vv v12, v8, v16 ; CHECK-NEXT: vmv.v.v v8, v12 @@ -520,11 +520,11 @@ define <16 x half> @reverse_v16f16(<16 x half> %a) { define <32 x half> @reverse_v32f16(<32 x half> %a) { ; CHECK-LABEL: reverse_v32f16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI34_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI34_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a1, %hi(.LCPI34_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI34_0) +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vle8.v v12, (a1) ; CHECK-NEXT: vsext.vf2 v16, v12 ; CHECK-NEXT: vrgather.vv v12, v8, v16 ; CHECK-NEXT: vmv.v.v v8, v12 @@ -820,33 +820,33 @@ define <6 x i64> @reverse_v6i64(<6 x i64> %a) { define <12 x i64> @reverse_v12i64(<12 x i64> %a) { ; RV32-BITS-UNKNOWN-LABEL: reverse_v12i64: ; RV32-BITS-UNKNOWN: # %bb.0: -; RV32-BITS-UNKNOWN-NEXT: lui a0, %hi(.LCPI46_0) -; RV32-BITS-UNKNOWN-NEXT: addi a0, a0, %lo(.LCPI46_0) -; RV32-BITS-UNKNOWN-NEXT: li a1, 32 -; 
RV32-BITS-UNKNOWN-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-BITS-UNKNOWN-NEXT: vle16.v v24, (a0) +; RV32-BITS-UNKNOWN-NEXT: li a0, 32 +; RV32-BITS-UNKNOWN-NEXT: lui a1, %hi(.LCPI46_0) +; RV32-BITS-UNKNOWN-NEXT: addi a1, a1, %lo(.LCPI46_0) +; RV32-BITS-UNKNOWN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-BITS-UNKNOWN-NEXT: vle16.v v24, (a1) ; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v16, v8, v24 ; RV32-BITS-UNKNOWN-NEXT: vmv.v.v v8, v16 ; RV32-BITS-UNKNOWN-NEXT: ret ; ; RV32-BITS-256-LABEL: reverse_v12i64: ; RV32-BITS-256: # %bb.0: -; RV32-BITS-256-NEXT: lui a0, %hi(.LCPI46_0) -; RV32-BITS-256-NEXT: addi a0, a0, %lo(.LCPI46_0) -; RV32-BITS-256-NEXT: li a1, 32 -; RV32-BITS-256-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-BITS-256-NEXT: vle16.v v24, (a0) +; RV32-BITS-256-NEXT: li a0, 32 +; RV32-BITS-256-NEXT: lui a1, %hi(.LCPI46_0) +; RV32-BITS-256-NEXT: addi a1, a1, %lo(.LCPI46_0) +; RV32-BITS-256-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-BITS-256-NEXT: vle16.v v24, (a1) ; RV32-BITS-256-NEXT: vrgatherei16.vv v16, v8, v24 ; RV32-BITS-256-NEXT: vmv.v.v v8, v16 ; RV32-BITS-256-NEXT: ret ; ; RV32-BITS-512-LABEL: reverse_v12i64: ; RV32-BITS-512: # %bb.0: -; RV32-BITS-512-NEXT: lui a0, %hi(.LCPI46_0) -; RV32-BITS-512-NEXT: addi a0, a0, %lo(.LCPI46_0) -; RV32-BITS-512-NEXT: li a1, 32 -; RV32-BITS-512-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-BITS-512-NEXT: vle16.v v24, (a0) +; RV32-BITS-512-NEXT: li a0, 32 +; RV32-BITS-512-NEXT: lui a1, %hi(.LCPI46_0) +; RV32-BITS-512-NEXT: addi a1, a1, %lo(.LCPI46_0) +; RV32-BITS-512-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-BITS-512-NEXT: vle16.v v24, (a1) ; RV32-BITS-512-NEXT: vrgatherei16.vv v16, v8, v24 ; RV32-BITS-512-NEXT: vmv.v.v v8, v16 ; RV32-BITS-512-NEXT: ret @@ -883,11 +883,11 @@ define <12 x i64> @reverse_v12i64(<12 x i64> %a) { ; ; RV32-ZVBB-LABEL: reverse_v12i64: ; RV32-ZVBB: # %bb.0: -; RV32-ZVBB-NEXT: lui a0, %hi(.LCPI46_0) -; RV32-ZVBB-NEXT: addi a0, a0, %lo(.LCPI46_0) -; RV32-ZVBB-NEXT: li 
a1, 32 -; RV32-ZVBB-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-ZVBB-NEXT: vle16.v v24, (a0) +; RV32-ZVBB-NEXT: li a0, 32 +; RV32-ZVBB-NEXT: lui a1, %hi(.LCPI46_0) +; RV32-ZVBB-NEXT: addi a1, a1, %lo(.LCPI46_0) +; RV32-ZVBB-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-ZVBB-NEXT: vle16.v v24, (a1) ; RV32-ZVBB-NEXT: vrgatherei16.vv v16, v8, v24 ; RV32-ZVBB-NEXT: vmv.v.v v8, v16 ; RV32-ZVBB-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll index 0161ac4bc338db..e2580c132f65e9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll @@ -225,11 +225,11 @@ declare <16 x i64> @llvm.experimental.stepvector.v16i64() define <16 x i64> @stepvector_v16i64() { ; RV32-LABEL: stepvector_v16i64: ; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI16_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI16_0) -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vle8.v v16, (a0) +; RV32-NEXT: li a0, 32 +; RV32-NEXT: lui a1, %hi(.LCPI16_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI16_0) +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vle8.v v16, (a1) ; RV32-NEXT: vsext.vf4 v8, v16 ; RV32-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll index 6e327457bebffc..368f454fa5fda1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll @@ -106,11 +106,11 @@ define <16 x i8> @v16i8(<16 x i8> %a) { define <32 x i8> @v16i8_2(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: v16i8_2: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI7_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI7_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a1, %hi(.LCPI7_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI7_0) +; 
CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vle8.v v12, (a1) ; CHECK-NEXT: vmv1r.v v14, v9 ; CHECK-NEXT: vrgather.vv v10, v8, v12 ; CHECK-NEXT: vid.v v8 @@ -230,11 +230,11 @@ define <16 x i16> @v16i16(<16 x i16> %a) { define <32 x i16> @v16i16_2(<16 x i16> %a, <16 x i16> %b) { ; CHECK-LABEL: v16i16_2: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI15_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI15_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a1, %hi(.LCPI15_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI15_0) +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vle16.v v16, (a1) ; CHECK-NEXT: vmv2r.v v20, v10 ; CHECK-NEXT: vmv2r.v v12, v8 ; CHECK-NEXT: vrgather.vv v8, v12, v16 @@ -363,11 +363,11 @@ define <16 x i32> @v16i32(<16 x i32> %a) { define <32 x i32> @v16i32_2(<16 x i32> %a, <16 x i32> %b) { ; CHECK-LABEL: v16i32_2: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI23_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI23_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle16.v v20, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a1, %hi(.LCPI23_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI23_0) +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vle16.v v20, (a1) ; CHECK-NEXT: vmv4r.v v24, v12 ; CHECK-NEXT: vmv4r.v v16, v8 ; CHECK-NEXT: vrgatherei16.vv v8, v16, v20 @@ -548,11 +548,11 @@ define <16 x half> @v16f16(<16 x half> %a) { define <32 x half> @v16f16_2(<16 x half> %a) { ; CHECK-LABEL: v16f16_2: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI35_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI35_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a1, %hi(.LCPI35_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI35_0) +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vle16.v v16, (a1) ; 
CHECK-NEXT: vrgather.vv v12, v8, v16 ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret @@ -719,11 +719,11 @@ define <8 x double> @v4f64_2(<4 x double> %a, <4 x double> %b) { define <32 x i8> @v32i8(<32 x i8> %a) { ; CHECK-LABEL: v32i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI46_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI46_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a1, %hi(.LCPI46_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI46_0) +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vle8.v v12, (a1) ; CHECK-NEXT: vrgather.vv v10, v8, v12 ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/tail-calls.ll b/llvm/test/CodeGen/RISCV/tail-calls.ll index 87d69bfad38c2b..d3e495bb723ad8 100644 --- a/llvm/test/CodeGen/RISCV/tail-calls.ll +++ b/llvm/test/CodeGen/RISCV/tail-calls.ll @@ -56,12 +56,12 @@ define void @caller_indirect_tail(i32 %a) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: beqz a0, .LBB3_2 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: lui a0, %hi(callee_indirect2) -; CHECK-NEXT: addi t1, a0, %lo(callee_indirect2) +; CHECK-NEXT: lui t1, %hi(callee_indirect2) +; CHECK-NEXT: addi t1, t1, %lo(callee_indirect2) ; CHECK-NEXT: jr t1 ; CHECK-NEXT: .LBB3_2: -; CHECK-NEXT: lui a0, %hi(callee_indirect1) -; CHECK-NEXT: addi t1, a0, %lo(callee_indirect1) +; CHECK-NEXT: lui t1, %hi(callee_indirect1) +; CHECK-NEXT: addi t1, t1, %lo(callee_indirect1) ; CHECK-NEXT: jr t1 diff --git a/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll b/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll index 2fd4572d234567..65307363048376 100644 --- a/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll +++ b/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll @@ -10,36 +10,30 @@ define signext i32 @unroll_loop_cse() { ; CHECK-LABEL: unroll_loop_cse: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(x) -; CHECK-NEXT: lw a3, %lo(x)(a1) -; CHECK-NEXT: lui a2, %hi(check) -; 
CHECK-NEXT: lw a4, %lo(check)(a2) +; CHECK-NEXT: lui a0, %hi(x) +; CHECK-NEXT: lw a1, %lo(x)(a0) +; CHECK-NEXT: lui a0, %hi(check) +; CHECK-NEXT: lw a2, %lo(check)(a0) ; CHECK-NEXT: li a0, 1 -; CHECK-NEXT: bne a3, a4, .LBB0_6 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: addi a1, a1, %lo(x) -; CHECK-NEXT: lw a1, 4(a1) -; CHECK-NEXT: addi a2, a2, %lo(check) -; CHECK-NEXT: lw a2, 4(a2) ; CHECK-NEXT: bne a1, a2, .LBB0_6 -; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a1, %hi(x) ; CHECK-NEXT: addi a1, a1, %lo(x) -; CHECK-NEXT: lw a3, 8(a1) +; CHECK-NEXT: lw a3, 4(a1) ; CHECK-NEXT: lui a2, %hi(check) ; CHECK-NEXT: addi a2, a2, %lo(check) +; CHECK-NEXT: lw a4, 4(a2) +; CHECK-NEXT: bne a3, a4, .LBB0_6 +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: lw a3, 8(a1) ; CHECK-NEXT: lw a4, 8(a2) ; CHECK-NEXT: bne a3, a4, .LBB0_6 ; CHECK-NEXT: # %bb.3: -; CHECK-NEXT: lw a1, 12(a1) -; CHECK-NEXT: lw a2, 12(a2) -; CHECK-NEXT: bne a1, a2, .LBB0_6 +; CHECK-NEXT: lw a3, 12(a1) +; CHECK-NEXT: lw a4, 12(a2) +; CHECK-NEXT: bne a3, a4, .LBB0_6 ; CHECK-NEXT: # %bb.4: -; CHECK-NEXT: lui a1, %hi(x) -; CHECK-NEXT: addi a1, a1, %lo(x) ; CHECK-NEXT: lw a3, 16(a1) -; CHECK-NEXT: lui a2, %hi(check) -; CHECK-NEXT: addi a2, a2, %lo(check) ; CHECK-NEXT: lw a4, 16(a2) ; CHECK-NEXT: bne a3, a4, .LBB0_6 ; CHECK-NEXT: # %bb.5: From 765206e050453018e861637a08a4520f29238074 Mon Sep 17 00:00:00 2001 From: gulfemsavrun Date: Tue, 28 May 2024 15:06:11 -0700 Subject: [PATCH 024/230] [CodeGen] Hidden visibility for prof version var (#93496) This patch adds hidden visibility to the variable that is used by the single byte counters mode in source-based code coverage. 
--- clang/lib/CodeGen/CodeGenPGO.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp index 76704c4d7be4a4..db8e6f55302adc 100644 --- a/clang/lib/CodeGen/CodeGenPGO.cpp +++ b/clang/lib/CodeGen/CodeGenPGO.cpp @@ -1340,7 +1340,7 @@ void CodeGenPGO::setProfileVersion(llvm::Module &M) { llvm::APInt(64, ProfileVersion)), VarName); - IRLevelVersionVariable->setVisibility(llvm::GlobalValue::DefaultVisibility); + IRLevelVersionVariable->setVisibility(llvm::GlobalValue::HiddenVisibility); llvm::Triple TT(M.getTargetTriple()); if (TT.supportsCOMDAT()) { IRLevelVersionVariable->setLinkage(llvm::GlobalValue::ExternalLinkage); From 067b4ccb4b5ab93ac2dc2243248a8934fa1f7ce3 Mon Sep 17 00:00:00 2001 From: Eric Date: Tue, 28 May 2024 15:19:04 -0700 Subject: [PATCH 025/230] Upstream libc++ buildbot restarter. (#93582) I've been running a cronjob on my local machine to restart preempted libc++ CI runs. This is bad and brittle. This upstreams a much better version of the restarter. It works by matching on check run annotations looking for mention of the machine being shut down. If there are both preempted jobs and failing jobs, we don't restart the workflow. Maybe we should change that? --- .../restart-preempted-libcxx-jobs.yaml | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 .github/workflows/restart-preempted-libcxx-jobs.yaml diff --git a/.github/workflows/restart-preempted-libcxx-jobs.yaml b/.github/workflows/restart-preempted-libcxx-jobs.yaml new file mode 100644 index 00000000000000..a71f2084182e5e --- /dev/null +++ b/.github/workflows/restart-preempted-libcxx-jobs.yaml @@ -0,0 +1,109 @@ +name: Restart Preempted Libc++ Workflow + +# The libc++ builders run on preemptable VMs, which can be shutdown at any time. +# This workflow identifies when a workflow run was canceled due to the VM being preempted, +# and restarts the workflow run. 
+ +# We identify a canceled workflow run by checking the annotations of the check runs in the check suite, +# which should contain the message "The runner has received a shutdown signal." + +# Note: If a job is both preempted and also contains a non-preemption failure, we do not restart the workflow. + +on: + workflow_run: + workflows: + - "Build and Test libc\+\+" + types: + - failure + - canceled + +permissions: + contents: read + +jobs: + restart: + if: github.repository_owner == 'llvm' + name: "Restart Job" + permissions: + statuses: read + checks: read + actions: write + runs-on: ubuntu-latest + steps: + - name: "Restart Job" + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea #v7.0.1 + with: + script: | + const failure_regex = /Process completed with exit code 1./ + const preemption_regex = /The runner has received a shutdown signal/ + + console.log('Listing check runs for suite') + const check_suites = await github.rest.checks.listForSuite({ + owner: context.repo.owner, + repo: context.repo.repo, + check_suite_id: context.payload.workflow_run.check_suite_id + }) + + check_run_ids = []; + for (check_run of check_suites.data.check_runs) { + console.log('Checking check run: ' + check_run.id); + console.log(check_run); + if (check_run.status != 'completed') { + console.log('Check run was not completed. Skipping.'); + continue; + } + if (check_run.conclusion != 'failure' && check_run.conclusion != 'cancelled') { + console.log('Check run had conclusion: ' + check_run.conclusion + '. 
Skipping.'); + continue; + } + check_run_ids.push(check_run.id); + } + + has_preempted_job = false; + + for (check_run_id of check_run_ids) { + console.log('Listing annotations for check run: ' + check_run_id); + + annotations = await github.rest.checks.listAnnotations({ + owner: context.repo.owner, + repo: context.repo.repo, + check_run_id: check_run_id + }) + + console.log(annotations); + for (annotation of annotations.data) { + if (annotation.annotation_level != 'failure') { + continue; + } + + const preemption_match = annotation.message.match(preemption_regex); + + if (preemption_match != null) { + console.log('Found preemption message: ' + annotation.message); + has_preempted_job = true; + } + + const failure_match = annotation.message.match(failure_regex); + if (failure_match != null) { + // We only want to restart the workflow if all of the failures were due to preemption. + // We don't want to restart the workflow if there were other failures. + console.log('Choosing not to rerun workflow because we found a non-preemption failure'); + console.log('Failure message: ' + annotation.message); + return; + } + } + } + + if (!has_preempted_job) { + console.log('No preempted jobs found. 
Not restarting workflow.'); + return; + } + + console.log("Restarted workflow: " + context.payload.workflow_run.id); + await github.rest.actions.reRunWorkflowFailedJobs({ + owner: context.repo.owner, + repo: context.repo.repo, + run_id: context.payload.workflow_run.id + }) + + From b9cdea66b62e2eb91814ef7c57ea01aa27440e72 Mon Sep 17 00:00:00 2001 From: Eric Fiselier Date: Tue, 28 May 2024 18:23:14 -0400 Subject: [PATCH 026/230] Attempt to fix issue with plus sign in libc++ workflow name --- .github/workflows/restart-preempted-libcxx-jobs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/restart-preempted-libcxx-jobs.yaml b/.github/workflows/restart-preempted-libcxx-jobs.yaml index a71f2084182e5e..5682b0a4f52c3d 100644 --- a/.github/workflows/restart-preempted-libcxx-jobs.yaml +++ b/.github/workflows/restart-preempted-libcxx-jobs.yaml @@ -12,7 +12,7 @@ name: Restart Preempted Libc++ Workflow on: workflow_run: workflows: - - "Build and Test libc\+\+" + - Build and Test libc\+\+ types: - failure - canceled From 6aeea700df6f3f8db9e6a79be4aa593c6fcc7d18 Mon Sep 17 00:00:00 2001 From: Spenser Bauman Date: Tue, 28 May 2024 18:29:17 -0400 Subject: [PATCH 027/230] [mlir][dataflow] Fix for integer range analysis propagation bug (#93199) Integer range analysis will not update the range of an operation when any of the inferred input lattices are uninitialized. In the current behavior, all lattice values for non integer types are uninitialized. For operations like arith.cmpf ```mlir %3 = arith.cmpf ugt, %arg0, %arg1 : f32 ``` that will result in the range of the output also being uninitialized, and so on for any consumer of the arith.cmpf result. When control-flow ops are involved, the lack of propagation results in incorrect ranges, as the back edges for loop carried values are not properly joined with the definitions from the body region. 
For example, an scf.while loop whose body region produces a value that is in a dataflow relationship with some floating-point values through an arith.cmpf operation: ```mlir func.func @test_bad_range(%arg0: f32, %arg1: f32) -> (index, index) { %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %3 = arith.cmpf ugt, %arg0, %arg1 : f32 %1:2 = scf.while (%arg2 = %c0, %arg3 = %c0) : (index, index) -> (index, index) { %2 = arith.cmpi ult, %arg2, %c4 : index scf.condition(%2) %arg2, %arg3 : index, index } do { ^bb0(%arg2: index, %arg3: index): %4 = arith.select %3, %arg3, %arg3 : index %5 = arith.addi %arg2, %c1 : index scf.yield %5, %4 : index, index } return %1#0, %1#1 : index, index } ``` The existing behavior results in the control condition %2 being optimized to true, turning the while loop into an infinite loop. The update to %arg2 through the body region is never factored into the range calculation, as the ranges for the body ops all test as uninitialized. This change causes all values initialized with setToEntryState to be set to some initialized range, even if the values are not integers. 
--------- Co-authored-by: Spenser Bauman --- .../Analysis/DataFlow/IntegerRangeAnalysis.h | 45 ----------- .../include/mlir/Dialect/Arith/IR/ArithOps.td | 16 ++-- mlir/include/mlir/Dialect/GPU/IR/GPUOps.td | 12 +-- .../include/mlir/Dialect/Index/IR/IndexOps.td | 2 +- .../mlir/Interfaces/InferIntRangeInterface.h | 75 ++++++++++++++++++- .../mlir/Interfaces/InferIntRangeInterface.td | 46 +++++++++--- .../Interfaces/Utils/InferIntRangeCommon.h | 8 +- .../DataFlow/IntegerRangeAnalysis.cpp | 51 ++++--------- .../Arith/IR/InferIntRangeInterfaceImpls.cpp | 18 +++-- .../lib/Interfaces/InferIntRangeInterface.cpp | 48 ++++++++++++ .../Interfaces/Utils/InferIntRangeCommon.cpp | 2 +- .../Dialect/Arith/int-range-interface.mlir | 19 +++++ mlir/test/lib/Dialect/Test/TestOps.td | 9 ++- 13 files changed, 230 insertions(+), 121 deletions(-) diff --git a/mlir/include/mlir/Analysis/DataFlow/IntegerRangeAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/IntegerRangeAnalysis.h index 8bd7cf880c6afb..191c023fb642cb 100644 --- a/mlir/include/mlir/Analysis/DataFlow/IntegerRangeAnalysis.h +++ b/mlir/include/mlir/Analysis/DataFlow/IntegerRangeAnalysis.h @@ -24,51 +24,6 @@ namespace mlir { namespace dataflow { -/// This lattice value represents the integer range of an SSA value. -class IntegerValueRange { -public: - /// Create a maximal range ([0, uint_max(t)] / [int_min(t), int_max(t)]) - /// range that is used to mark the value as unable to be analyzed further, - /// where `t` is the type of `value`. - static IntegerValueRange getMaxRange(Value value); - - /// Create an integer value range lattice value. - IntegerValueRange(std::optional value = std::nullopt) - : value(std::move(value)) {} - - /// Whether the range is uninitialized. This happens when the state hasn't - /// been set during the analysis. - bool isUninitialized() const { return !value.has_value(); } - - /// Get the known integer value range. 
- const ConstantIntRanges &getValue() const { - assert(!isUninitialized()); - return *value; - } - - /// Compare two ranges. - bool operator==(const IntegerValueRange &rhs) const { - return value == rhs.value; - } - - /// Take the union of two ranges. - static IntegerValueRange join(const IntegerValueRange &lhs, - const IntegerValueRange &rhs) { - if (lhs.isUninitialized()) - return rhs; - if (rhs.isUninitialized()) - return lhs; - return IntegerValueRange{lhs.getValue().rangeUnion(rhs.getValue())}; - } - - /// Print the integer value range. - void print(raw_ostream &os) const { os << value; } - -private: - /// The known integer value range. - std::optional value; -}; - /// This lattice element represents the integer value range of an SSA value. /// When this lattice is updated, it automatically updates the constant value /// of the SSA value (if the range can be narrowed to one). diff --git a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td index ead52332e8eec3..46248dad3be9e0 100644 --- a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td +++ b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td @@ -49,7 +49,7 @@ class Arith_BinaryOp traits = []> : // Base class for integer binary operations. class Arith_IntBinaryOp traits = []> : Arith_BinaryOp]>, + [DeclareOpInterfaceMethods]>, Arguments<(ins SignlessIntegerLike:$lhs, SignlessIntegerLike:$rhs)>, Results<(outs SignlessIntegerLike:$result)>; @@ -107,7 +107,7 @@ class Arith_IToICastOp traits = []> : Arith_CastOp]>; + [DeclareOpInterfaceMethods]>; // Cast from an integer type to a floating point type. 
class Arith_IToFCastOp traits = []> : Arith_CastOp; @@ -139,7 +139,7 @@ class Arith_CompareOpOfAnyRank traits = []> : class Arith_IntBinaryOpWithOverflowFlags traits = []> : Arith_BinaryOp, + [Pure, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods]>, Arguments<(ins SignlessIntegerLike:$lhs, SignlessIntegerLike:$rhs, DefaultValuedAttr< @@ -159,7 +159,7 @@ def Arith_ConstantOp : Op, AllTypesMatch<["value", "result"]>, - DeclareOpInterfaceMethods]> { + DeclareOpInterfaceMethods]> { let summary = "integer or floating point constant"; let description = [{ The `constant` operation produces an SSA value equal to some integer or @@ -1327,7 +1327,7 @@ def IndexCastTypeConstraint : TypeConstraint]> { + [DeclareOpInterfaceMethods]> { let summary = "cast between index and integer types"; let description = [{ Casts between scalar or vector integers and corresponding 'index' scalar or @@ -1346,7 +1346,7 @@ def Arith_IndexCastOp def Arith_IndexCastUIOp : Arith_CastOp<"index_castui", IndexCastTypeConstraint, IndexCastTypeConstraint, - [DeclareOpInterfaceMethods]> { + [DeclareOpInterfaceMethods]> { let summary = "unsigned cast between index and integer types"; let description = [{ Casts between scalar or vector integers and corresponding 'index' scalar or @@ -1400,7 +1400,7 @@ def Arith_BitcastOp : Arith_CastOp<"bitcast", BitcastTypeConstraint, def Arith_CmpIOp : Arith_CompareOpOfAnyRank<"cmpi", - [DeclareOpInterfaceMethods]> { + [DeclareOpInterfaceMethods]> { let summary = "integer comparison operation"; let description = [{ The `cmpi` operation is a generic comparison for integer-like types. 
Its two @@ -1555,7 +1555,7 @@ class ScalarConditionOrMatchingShape names> : def SelectOp : Arith_Op<"select", [Pure, AllTypesMatch<["true_value", "false_value", "result"]>, ScalarConditionOrMatchingShape<["condition", "result"]>, - DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, ] # ElementwiseMappable.traits> { let summary = "select operation"; let description = [{ diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td index 1da68ed2176d8f..10719aae5c8b46 100644 --- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td @@ -52,7 +52,7 @@ def GPU_DimensionAttr : EnumAttr; class GPU_IndexOp traits = []> : GPU_Op, + DeclareOpInterfaceMethods, DeclareOpInterfaceMethods])>, Arguments<(ins GPU_DimensionAttr:$dimension)>, Results<(outs Index)> { let assemblyFormat = "$dimension attr-dict"; @@ -144,7 +144,7 @@ def GPU_ThreadIdOp : GPU_IndexOp<"thread_id"> { } def GPU_LaneIdOp : GPU_Op<"lane_id", [ - Pure, DeclareOpInterfaceMethods]> { + Pure, DeclareOpInterfaceMethods]> { let description = [{ Returns the lane id within the subgroup (warp/wave). @@ -158,7 +158,7 @@ def GPU_LaneIdOp : GPU_Op<"lane_id", [ } def GPU_SubgroupIdOp : GPU_Op<"subgroup_id", [ - Pure, DeclareOpInterfaceMethods]>, + Pure, DeclareOpInterfaceMethods]>, Arguments<(ins)>, Results<(outs Index:$result)> { let description = [{ Returns the subgroup id, i.e., the index of the current subgroup within the @@ -190,7 +190,7 @@ def GPU_GlobalIdOp : GPU_IndexOp<"global_id"> { def GPU_NumSubgroupsOp : GPU_Op<"num_subgroups", [ - Pure, DeclareOpInterfaceMethods]>, + Pure, DeclareOpInterfaceMethods]>, Arguments<(ins)>, Results<(outs Index:$result)> { let description = [{ Returns the number of subgroups within a workgroup. 
@@ -206,7 +206,7 @@ def GPU_NumSubgroupsOp : GPU_Op<"num_subgroups", [ } def GPU_SubgroupSizeOp : GPU_Op<"subgroup_size", [ - Pure, DeclareOpInterfaceMethods]>, + Pure, DeclareOpInterfaceMethods]>, Arguments<(ins)>, Results<(outs Index:$result)> { let description = [{ Returns the number of threads within a subgroup. @@ -687,7 +687,7 @@ def GPU_LaunchFuncOp :GPU_Op<"launch_func", [ def GPU_LaunchOp : GPU_Op<"launch", [ AutomaticAllocationScope, AttrSizedOperandSegments, GPU_AsyncOpInterface, - DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, RecursiveMemoryEffects]>, Arguments<(ins Variadic:$asyncDependencies, Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ, diff --git a/mlir/include/mlir/Dialect/Index/IR/IndexOps.td b/mlir/include/mlir/Dialect/Index/IR/IndexOps.td index c6079cb8a98c81..a30ae9f739cbc6 100644 --- a/mlir/include/mlir/Dialect/Index/IR/IndexOps.td +++ b/mlir/include/mlir/Dialect/Index/IR/IndexOps.td @@ -25,7 +25,7 @@ include "mlir/IR/OpBase.td" /// Base class for Index dialect operations. class IndexOp traits = []> : Op] # traits>; + [DeclareOpInterfaceMethods] # traits>; //===----------------------------------------------------------------------===// // IndexBinaryOp diff --git a/mlir/include/mlir/Interfaces/InferIntRangeInterface.h b/mlir/include/mlir/Interfaces/InferIntRangeInterface.h index 05064a72ef02e7..0e107e88f5232f 100644 --- a/mlir/include/mlir/Interfaces/InferIntRangeInterface.h +++ b/mlir/include/mlir/Interfaces/InferIntRangeInterface.h @@ -105,10 +105,83 @@ class ConstantIntRanges { raw_ostream &operator<<(raw_ostream &, const ConstantIntRanges &); +/// This lattice value represents the integer range of an SSA value. +class IntegerValueRange { +public: + /// Create a maximal range ([0, uint_max(t)] / [int_min(t), int_max(t)]) + /// range that is used to mark the value as unable to be analyzed further, + /// where `t` is the type of `value`. 
+ static IntegerValueRange getMaxRange(Value value); + + /// Create an integer value range lattice value. + IntegerValueRange(ConstantIntRanges value) : value(std::move(value)) {} + + /// Create an integer value range lattice value. + IntegerValueRange(std::optional value = std::nullopt) + : value(std::move(value)) {} + + /// Whether the range is uninitialized. This happens when the state hasn't + /// been set during the analysis. + bool isUninitialized() const { return !value.has_value(); } + + /// Get the known integer value range. + const ConstantIntRanges &getValue() const { + assert(!isUninitialized()); + return *value; + } + + /// Compare two ranges. + bool operator==(const IntegerValueRange &rhs) const { + return value == rhs.value; + } + + /// Compute the least upper bound of two ranges. + static IntegerValueRange join(const IntegerValueRange &lhs, + const IntegerValueRange &rhs) { + if (lhs.isUninitialized()) + return rhs; + if (rhs.isUninitialized()) + return lhs; + return IntegerValueRange{lhs.getValue().rangeUnion(rhs.getValue())}; + } + + /// Print the integer value range. + void print(raw_ostream &os) const { os << value; } + +private: + /// The known integer value range. + std::optional value; +}; + +raw_ostream &operator<<(raw_ostream &, const IntegerValueRange &); + /// The type of the `setResultRanges` callback provided to ops implementing /// InferIntRangeInterface. It should be called once for each integer result /// value and be passed the ConstantIntRanges corresponding to that value. -using SetIntRangeFn = function_ref; +using SetIntRangeFn = + llvm::function_ref; + +/// Similar to SetIntRangeFn, but operating on IntegerValueRange lattice values. +/// This is the `setResultRanges` callback for the IntegerValueRange based +/// interface method. 
+using SetIntLatticeFn = + llvm::function_ref; + +class InferIntRangeInterface; + +namespace intrange::detail { +/// Default implementation of `inferResultRanges` which dispatches to the +/// `inferResultRangesFromOptional`. +void defaultInferResultRanges(InferIntRangeInterface interface, + ArrayRef argRanges, + SetIntLatticeFn setResultRanges); + +/// Default implementation of `inferResultRangesFromOptional` which dispatches +/// to the `inferResultRanges`. +void defaultInferResultRangesFromOptional(InferIntRangeInterface interface, + ArrayRef argRanges, + SetIntRangeFn setResultRanges); +} // end namespace intrange::detail } // end namespace mlir #include "mlir/Interfaces/InferIntRangeInterface.h.inc" diff --git a/mlir/include/mlir/Interfaces/InferIntRangeInterface.td b/mlir/include/mlir/Interfaces/InferIntRangeInterface.td index dbdc526c6f10b6..6ee436ce4d6c2f 100644 --- a/mlir/include/mlir/Interfaces/InferIntRangeInterface.td +++ b/mlir/include/mlir/Interfaces/InferIntRangeInterface.td @@ -28,9 +28,10 @@ def InferIntRangeInterface : OpInterface<"InferIntRangeInterface"> { Infer the bounds on the results of this op given the bounds on its arguments. For each result value or block argument (that isn't a branch argument, since the dataflow analysis handles those case), the method should call - `setValueRange` with that `Value` as an argument. When `setValueRange` - is not called for some value, it will recieve a default value of the mimimum - and maximum values for its type (the unbounded range). + `setValueRange` with that `Value` as an argument. When implemented, + `setValueRange` should be called on all result values for the operation. + When operations take non-integer inputs, the + `inferResultRangesFromOptional` method should be implemented instead. 
When called on an op that also implements the RegionBranchOpInterface or BranchOpInterface, this method should not attempt to infer the values @@ -39,14 +40,39 @@ def InferIntRangeInterface : OpInterface<"InferIntRangeInterface"> { This function will only be called when at least one result of the op is a scalar integer value or the op has a region. + }], + /*retTy=*/"void", + /*methodName=*/"inferResultRanges", + /*args=*/(ins "::llvm::ArrayRef<::mlir::ConstantIntRanges>":$argRanges, + "::mlir::SetIntRangeFn":$setResultRanges), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + ::mlir::intrange::detail::defaultInferResultRangesFromOptional($_op, + argRanges, + setResultRanges); + }]>, + + InterfaceMethod<[{ + Infer the bounds on the results of this op given the lattice representation + of the bounds for its arguments. For each result value or block argument + (that isn't a branch argument, since the dataflow analysis handles + those case), the method should call `setValueRange` with that `Value` + as an argument. When implemented, `setValueRange` should be called on + all result values for the operation. - `argRanges` contains one `IntRangeAttrs` for each argument to the op in ODS - order. Non-integer arguments will have the an unbounded range of width-0 - APInts in their `argRanges` element. + This method allows for more precise implementations when operations + want to reason about inputs which may be undefined during the analysis. 
}], - "void", "inferResultRanges", (ins - "::llvm::ArrayRef<::mlir::ConstantIntRanges>":$argRanges, - "::mlir::SetIntRangeFn":$setResultRanges) - >]; + /*retTy=*/"void", + /*methodName=*/"inferResultRangesFromOptional", + /*args=*/(ins "::llvm::ArrayRef<::mlir::IntegerValueRange>":$argRanges, + "::mlir::SetIntLatticeFn":$setResultRanges), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + ::mlir::intrange::detail::defaultInferResultRanges($_op, + argRanges, + setResultRanges); + }]> + ]; } #endif // MLIR_INTERFACES_INFERINTRANGEINTERFACE diff --git a/mlir/include/mlir/Interfaces/Utils/InferIntRangeCommon.h b/mlir/include/mlir/Interfaces/Utils/InferIntRangeCommon.h index 851bb534bc7ee1..3988a8826498a9 100644 --- a/mlir/include/mlir/Interfaces/Utils/InferIntRangeCommon.h +++ b/mlir/include/mlir/Interfaces/Utils/InferIntRangeCommon.h @@ -25,7 +25,11 @@ namespace intrange { /// abstracted away here to permit writing the function that handles both /// 64- and 32-bit index types. using InferRangeFn = - function_ref)>; + std::function)>; + +/// Function that performs inferrence on an array of `IntegerValueRange`. +using InferIntegerValueRangeFn = + std::function)>; static constexpr unsigned indexMinWidth = 32; static constexpr unsigned indexMaxWidth = 64; @@ -52,7 +56,7 @@ using InferRangeWithOvfFlagsFn = /// /// The `mode` argument specifies if the unsigned, signed, or both results of /// the inference computation should be used when comparing the results. 
-ConstantIntRanges inferIndexOp(InferRangeFn inferFn, +ConstantIntRanges inferIndexOp(const InferRangeFn &inferFn, ArrayRef argRanges, CmpMode mode); diff --git a/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp b/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp index a82c30717e275b..9721620807a0f0 100644 --- a/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp +++ b/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp @@ -36,17 +36,6 @@ using namespace mlir; using namespace mlir::dataflow; -IntegerValueRange IntegerValueRange::getMaxRange(Value value) { - unsigned width = ConstantIntRanges::getStorageBitwidth(value.getType()); - if (width == 0) - return {}; - APInt umin = APInt::getMinValue(width); - APInt umax = APInt::getMaxValue(width); - APInt smin = width != 0 ? APInt::getSignedMinValue(width) : umin; - APInt smax = width != 0 ? APInt::getSignedMaxValue(width) : umax; - return IntegerValueRange{ConstantIntRanges{umin, umax, smin, smax}}; -} - void IntegerValueRangeLattice::onUpdate(DataFlowSolver *solver) const { Lattice::onUpdate(solver); @@ -72,24 +61,17 @@ void IntegerValueRangeLattice::onUpdate(DataFlowSolver *solver) const { void IntegerRangeAnalysis::visitOperation( Operation *op, ArrayRef operands, ArrayRef results) { - // If the lattice on any operand is unitialized, bail out. 
- if (llvm::any_of(operands, [](const IntegerValueRangeLattice *lattice) { - return lattice->getValue().isUninitialized(); - })) { - return; - } - auto inferrable = dyn_cast(op); if (!inferrable) return setAllToEntryStates(results); LLVM_DEBUG(llvm::dbgs() << "Inferring ranges for " << *op << "\n"); - SmallVector argRanges( - llvm::map_range(operands, [](const IntegerValueRangeLattice *val) { - return val->getValue().getValue(); - })); + auto argRanges = llvm::map_to_vector( + operands, [](const IntegerValueRangeLattice *lattice) { + return lattice->getValue(); + }); - auto joinCallback = [&](Value v, const ConstantIntRanges &attrs) { + auto joinCallback = [&](Value v, const IntegerValueRange &attrs) { auto result = dyn_cast(v); if (!result) return; @@ -99,7 +81,7 @@ void IntegerRangeAnalysis::visitOperation( IntegerValueRangeLattice *lattice = results[result.getResultNumber()]; IntegerValueRange oldRange = lattice->getValue(); - ChangeResult changed = lattice->join(IntegerValueRange{attrs}); + ChangeResult changed = lattice->join(attrs); // Catch loop results with loop variant bounds and conservatively make // them [-inf, inf] so we don't circle around infinitely often (because @@ -116,7 +98,7 @@ void IntegerRangeAnalysis::visitOperation( propagateIfChanged(lattice, changed); }; - inferrable.inferResultRanges(argRanges, joinCallback); + inferrable.inferResultRangesFromOptional(argRanges, joinCallback); } void IntegerRangeAnalysis::visitNonControlFlowArguments( @@ -124,17 +106,12 @@ void IntegerRangeAnalysis::visitNonControlFlowArguments( ArrayRef argLattices, unsigned firstIndex) { if (auto inferrable = dyn_cast(op)) { LLVM_DEBUG(llvm::dbgs() << "Inferring ranges for " << *op << "\n"); - // If the lattice on any operand is unitialized, bail out. 
- if (llvm::any_of(op->getOperands(), [&](Value value) { - return getLatticeElementFor(op, value)->getValue().isUninitialized(); - })) - return; - SmallVector argRanges( - llvm::map_range(op->getOperands(), [&](Value value) { - return getLatticeElementFor(op, value)->getValue().getValue(); - })); - auto joinCallback = [&](Value v, const ConstantIntRanges &attrs) { + auto argRanges = llvm::map_to_vector(op->getOperands(), [&](Value value) { + return getLatticeElementFor(op, value)->getValue(); + }); + + auto joinCallback = [&](Value v, const IntegerValueRange &attrs) { auto arg = dyn_cast(v); if (!arg) return; @@ -145,7 +122,7 @@ void IntegerRangeAnalysis::visitNonControlFlowArguments( IntegerValueRangeLattice *lattice = argLattices[arg.getArgNumber()]; IntegerValueRange oldRange = lattice->getValue(); - ChangeResult changed = lattice->join(IntegerValueRange{attrs}); + ChangeResult changed = lattice->join(attrs); // Catch loop results with loop variant bounds and conservatively make // them [-inf, inf] so we don't circle around infinitely often (because @@ -162,7 +139,7 @@ void IntegerRangeAnalysis::visitNonControlFlowArguments( propagateIfChanged(lattice, changed); }; - inferrable.inferResultRanges(argRanges, joinCallback); + inferrable.inferResultRangesFromOptional(argRanges, joinCallback); return; } diff --git a/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp index fbe2ecab8adcaa..462044417b5fb8 100644 --- a/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp +++ b/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp @@ -295,18 +295,24 @@ void arith::CmpIOp::inferResultRanges(ArrayRef argRanges, // SelectOp //===----------------------------------------------------------------------===// -void arith::SelectOp::inferResultRanges(ArrayRef argRanges, - SetIntRangeFn setResultRange) { - std::optional mbCondVal = argRanges[0].getConstantValue(); +void 
arith::SelectOp::inferResultRangesFromOptional( + ArrayRef argRanges, SetIntLatticeFn setResultRange) { + std::optional mbCondVal = + argRanges[0].isUninitialized() + ? std::nullopt + : argRanges[0].getValue().getConstantValue(); + + const IntegerValueRange &trueCase = argRanges[1]; + const IntegerValueRange &falseCase = argRanges[2]; if (mbCondVal) { if (mbCondVal->isZero()) - setResultRange(getResult(), argRanges[2]); + setResultRange(getResult(), falseCase); else - setResultRange(getResult(), argRanges[1]); + setResultRange(getResult(), trueCase); return; } - setResultRange(getResult(), argRanges[1].rangeUnion(argRanges[2])); + setResultRange(getResult(), IntegerValueRange::join(trueCase, falseCase)); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Interfaces/InferIntRangeInterface.cpp b/mlir/lib/Interfaces/InferIntRangeInterface.cpp index b3f6c0ee3cc32d..d879b93586899b 100644 --- a/mlir/lib/Interfaces/InferIntRangeInterface.cpp +++ b/mlir/lib/Interfaces/InferIntRangeInterface.cpp @@ -126,3 +126,51 @@ raw_ostream &mlir::operator<<(raw_ostream &os, const ConstantIntRanges &range) { return os << "unsigned : [" << range.umin() << ", " << range.umax() << "] signed : [" << range.smin() << ", " << range.smax() << "]"; } + +IntegerValueRange IntegerValueRange::getMaxRange(Value value) { + unsigned width = ConstantIntRanges::getStorageBitwidth(value.getType()); + if (width == 0) + return {}; + + APInt umin = APInt::getMinValue(width); + APInt umax = APInt::getMaxValue(width); + APInt smin = width != 0 ? APInt::getSignedMinValue(width) : umin; + APInt smax = width != 0 ? 
APInt::getSignedMaxValue(width) : umax; + return IntegerValueRange{ConstantIntRanges{umin, umax, smin, smax}}; +} + +raw_ostream &mlir::operator<<(raw_ostream &os, const IntegerValueRange &range) { + range.print(os); + return os; +} + +void mlir::intrange::detail::defaultInferResultRanges( + InferIntRangeInterface interface, ArrayRef argRanges, + SetIntLatticeFn setResultRanges) { + llvm::SmallVector unpacked; + unpacked.reserve(argRanges.size()); + + for (const IntegerValueRange &range : argRanges) { + if (range.isUninitialized()) + return; + unpacked.push_back(range.getValue()); + } + + interface.inferResultRanges( + unpacked, + [&setResultRanges](Value value, const ConstantIntRanges &argRanges) { + setResultRanges(value, IntegerValueRange{argRanges}); + }); +} + +void mlir::intrange::detail::defaultInferResultRangesFromOptional( + InferIntRangeInterface interface, ArrayRef argRanges, + SetIntRangeFn setResultRanges) { + auto ranges = llvm::to_vector_of(argRanges); + interface.inferResultRangesFromOptional( + ranges, + [&setResultRanges](Value value, const IntegerValueRange &argRanges) { + if (!argRanges.isUninitialized()) + setResultRanges(value, argRanges.getValue()); + }); +} diff --git a/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp b/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp index fe1a67d6287386..5b8d35e7bd5197 100644 --- a/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp +++ b/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp @@ -76,7 +76,7 @@ static ConstantIntRanges minMaxBy(ConstArithFn op, ArrayRef lhs, //===----------------------------------------------------------------------===// ConstantIntRanges -mlir::intrange::inferIndexOp(InferRangeFn inferFn, +mlir::intrange::inferIndexOp(const InferRangeFn &inferFn, ArrayRef argRanges, intrange::CmpMode mode) { ConstantIntRanges sixtyFour = inferFn(argRanges); diff --git a/mlir/test/Dialect/Arith/int-range-interface.mlir b/mlir/test/Dialect/Arith/int-range-interface.mlir index 
5b538197a0c117..60f0ab41afa48d 100644 --- a/mlir/test/Dialect/Arith/int-range-interface.mlir +++ b/mlir/test/Dialect/Arith/int-range-interface.mlir @@ -899,3 +899,22 @@ func.func @test_shl_i8_nowrap() -> i8 { %2 = test.reflect_bounds %1 : i8 return %2: i8 } + +/// A test case to ensure that the ranges for unsupported ops are initialized +/// properly to maxRange, rather than left uninitialized. +/// In this test case, the previous behavior would leave the ranges for %a and +/// %b uninitialized, resulting in arith.cmpf's range not being updated, even +/// though it has an integer valued result. + +// CHECK-LABEL: func @test_cmpf_propagates +// CHECK: test.reflect_bounds {smax = 2 : index, smin = 1 : index, umax = 2 : index, umin = 1 : index} +func.func @test_cmpf_propagates(%a: f32, %b: f32) -> index { + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + + %0 = arith.cmpf ueq, %a, %b : f32 + %1 = arith.select %0, %c1, %c2 : index + %2 = test.reflect_bounds %1 : index + func.return %2 : index +} + diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td index 18324482153a54..9d7e0a7928ab8d 100644 --- a/mlir/test/lib/Dialect/Test/TestOps.td +++ b/mlir/test/lib/Dialect/Test/TestOps.td @@ -2750,7 +2750,7 @@ def TestGraphLoopOp : TEST_Op<"graph_loop", def InferIntRangeType : AnyTypeOf<[AnyInteger, Index]>; def TestWithBoundsOp : TEST_Op<"with_bounds", - [DeclareOpInterfaceMethods, + [DeclareOpInterfaceMethods, NoMemoryEffect]> { let arguments = (ins APIntAttr:$umin, APIntAttr:$umax, @@ -2762,7 +2762,7 @@ def TestWithBoundsOp : TEST_Op<"with_bounds", } def TestWithBoundsRegionOp : TEST_Op<"with_bounds_region", - [DeclareOpInterfaceMethods, + [DeclareOpInterfaceMethods, SingleBlock, NoTerminator]> { let arguments = (ins APIntAttr:$umin, APIntAttr:$umax, @@ -2774,7 +2774,7 @@ def TestWithBoundsRegionOp : TEST_Op<"with_bounds_region", } def TestIncrementOp : TEST_Op<"increment", - [DeclareOpInterfaceMethods, + 
[DeclareOpInterfaceMethods, NoMemoryEffect, AllTypesMatch<["value", "result"]>]> { let arguments = (ins InferIntRangeType:$value); let results = (outs InferIntRangeType:$result); @@ -2783,7 +2783,8 @@ def TestIncrementOp : TEST_Op<"increment", } def TestReflectBoundsOp : TEST_Op<"reflect_bounds", - [DeclareOpInterfaceMethods, AllTypesMatch<["value", "result"]>]> { + [DeclareOpInterfaceMethods, + AllTypesMatch<["value", "result"]>]> { let arguments = (ins InferIntRangeType:$value, OptionalAttr:$umin, OptionalAttr:$umax, From 20d497c26fc95c80a1bacb38820d92e5f52bec58 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 28 May 2024 15:33:59 -0700 Subject: [PATCH 028/230] [Driver] Remove unneeded *-linux-gnu after D158183 Recommit 435ea21c897f94b5a3777a9f152e4c5bb4a371a3. As the comment added by a07727199db0525e9d2df41e466a2a1611b3c8e1 suggests, these `*Triples` lists should shrink over time. https://reviews.llvm.org/D158183 allows *-unknown-linux-gnu to detect *-linux-gnu. If we additionally allow x86_64-unknown-linux-gnu -m32/-mx32 to detect x86_64-linux-gnu, we can mostly remove these *-linux-gnu elements. Retain x86_64-linux-gnu for now to work around #93609. (In addition, Debian /usr/bin/clang --version uses x86_64-pc-linux-gnu). Retain i586-linux-gnu for now to work around #93502. --- clang/lib/Driver/ToolChains/Gnu.cpp | 69 ++++++++++++++--------------- 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index 9849c59685cca7..b141e5f2adfab1 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -2227,10 +2227,19 @@ void Generic_GCC::GCCInstallationDetector::init( SmallVector CandidateBiarchTripleAliases; // Add some triples that we want to check first. 
CandidateTripleAliases.push_back(TargetTriple.str()); - std::string TripleNoVendor = TargetTriple.getArchName().str() + "-" + - TargetTriple.getOSAndEnvironmentName().str(); - if (TargetTriple.getVendor() == llvm::Triple::UnknownVendor) + std::string TripleNoVendor, BiarchTripleNoVendor; + if (TargetTriple.getVendor() == llvm::Triple::UnknownVendor) { + StringRef OSEnv = TargetTriple.getOSAndEnvironmentName(); + if (TargetTriple.getEnvironment() == llvm::Triple::GNUX32) + OSEnv = "linux-gnu"; + TripleNoVendor = (TargetTriple.getArchName().str() + '-' + OSEnv).str(); CandidateTripleAliases.push_back(TripleNoVendor); + if (BiarchVariantTriple.getArch() != llvm::Triple::UnknownArch) { + BiarchTripleNoVendor = + (BiarchVariantTriple.getArchName().str() + '-' + OSEnv).str(); + CandidateBiarchTripleAliases.push_back(BiarchTripleNoVendor); + } + } CollectLibDirsAndTriples(TargetTriple, BiarchVariantTriple, CandidateLibDirs, CandidateTripleAliases, CandidateBiarchLibDirs, @@ -2453,11 +2462,9 @@ void Generic_GCC::GCCInstallationDetector::AddDefaultGCCPrefixes( // lists should shrink over time. Please don't add more elements to *Triples. 
static const char *const AArch64LibDirs[] = {"/lib64", "/lib"}; static const char *const AArch64Triples[] = { - "aarch64-none-linux-gnu", "aarch64-linux-gnu", "aarch64-redhat-linux", - "aarch64-suse-linux"}; + "aarch64-none-linux-gnu", "aarch64-redhat-linux", "aarch64-suse-linux"}; static const char *const AArch64beLibDirs[] = {"/lib"}; - static const char *const AArch64beTriples[] = {"aarch64_be-none-linux-gnu", - "aarch64_be-linux-gnu"}; + static const char *const AArch64beTriples[] = {"aarch64_be-none-linux-gnu"}; static const char *const ARMLibDirs[] = {"/lib"}; static const char *const ARMTriples[] = {"arm-linux-gnueabi"}; @@ -2482,9 +2489,8 @@ void Generic_GCC::GCCInstallationDetector::AddDefaultGCCPrefixes( "x86_64-linux-gnu", "x86_64-unknown-linux-gnu", "x86_64-pc-linux-gnu", "x86_64-redhat-linux6E", "x86_64-redhat-linux", "x86_64-suse-linux", - "x86_64-manbo-linux-gnu", "x86_64-linux-gnu", - "x86_64-slackware-linux", "x86_64-unknown-linux", - "x86_64-amazon-linux"}; + "x86_64-manbo-linux-gnu", "x86_64-slackware-linux", + "x86_64-unknown-linux", "x86_64-amazon-linux"}; static const char *const X32Triples[] = {"x86_64-linux-gnux32", "x86_64-pc-linux-gnux32"}; static const char *const X32LibDirs[] = {"/libx32", "/lib"}; @@ -2500,26 +2506,24 @@ void Generic_GCC::GCCInstallationDetector::AddDefaultGCCPrefixes( "loongarch64-linux-gnu", "loongarch64-unknown-linux-gnu"}; static const char *const M68kLibDirs[] = {"/lib"}; - static const char *const M68kTriples[] = { - "m68k-linux-gnu", "m68k-unknown-linux-gnu", "m68k-suse-linux"}; + static const char *const M68kTriples[] = {"m68k-unknown-linux-gnu", + "m68k-suse-linux"}; static const char *const MIPSLibDirs[] = {"/libo32", "/lib"}; static const char *const MIPSTriples[] = { "mips-linux-gnu", "mips-mti-linux", "mips-mti-linux-gnu", "mips-img-linux-gnu", "mipsisa32r6-linux-gnu"}; static const char *const MIPSELLibDirs[] = {"/libo32", "/lib"}; - static const char *const MIPSELTriples[] = { - "mipsel-linux-gnu", 
"mips-img-linux-gnu", "mipsisa32r6el-linux-gnu"}; + static const char *const MIPSELTriples[] = {"mipsel-linux-gnu", + "mips-img-linux-gnu"}; static const char *const MIPS64LibDirs[] = {"/lib64", "/lib"}; static const char *const MIPS64Triples[] = { - "mips64-linux-gnu", "mips-mti-linux-gnu", - "mips-img-linux-gnu", "mips64-linux-gnuabi64", + "mips-mti-linux-gnu", "mips-img-linux-gnu", "mips64-linux-gnuabi64", "mipsisa64r6-linux-gnu", "mipsisa64r6-linux-gnuabi64"}; static const char *const MIPS64ELLibDirs[] = {"/lib64", "/lib"}; static const char *const MIPS64ELTriples[] = { - "mips64el-linux-gnu", "mips-mti-linux-gnu", - "mips-img-linux-gnu", "mips64el-linux-gnuabi64", + "mips-mti-linux-gnu", "mips-img-linux-gnu", "mips64el-linux-gnuabi64", "mipsisa64r6el-linux-gnu", "mipsisa64r6el-linux-gnuabi64"}; static const char *const MIPSN32LibDirs[] = {"/lib32"}; @@ -2534,46 +2538,39 @@ void Generic_GCC::GCCInstallationDetector::AddDefaultGCCPrefixes( static const char *const PPCLibDirs[] = {"/lib32", "/lib"}; static const char *const PPCTriples[] = { - "powerpc-linux-gnu", "powerpc-unknown-linux-gnu", "powerpc-linux-gnuspe", + "powerpc-unknown-linux-gnu", // On 32-bit PowerPC systems running SUSE Linux, gcc is configured as a // 64-bit compiler which defaults to "-m32", hence "powerpc64-suse-linux". 
"powerpc64-suse-linux", "powerpc-montavista-linuxspe"}; static const char *const PPCLELibDirs[] = {"/lib32", "/lib"}; - static const char *const PPCLETriples[] = {"powerpcle-linux-gnu", - "powerpcle-unknown-linux-gnu", + static const char *const PPCLETriples[] = {"powerpcle-unknown-linux-gnu", "powerpcle-linux-musl"}; static const char *const PPC64LibDirs[] = {"/lib64", "/lib"}; - static const char *const PPC64Triples[] = { - "powerpc64-linux-gnu", "powerpc64-unknown-linux-gnu", - "powerpc64-suse-linux", "ppc64-redhat-linux"}; + static const char *const PPC64Triples[] = {"powerpc64-unknown-linux-gnu", + "powerpc64-suse-linux", + "ppc64-redhat-linux"}; static const char *const PPC64LELibDirs[] = {"/lib64", "/lib"}; static const char *const PPC64LETriples[] = { - "powerpc64le-linux-gnu", "powerpc64le-unknown-linux-gnu", - "powerpc64le-none-linux-gnu", "powerpc64le-suse-linux", - "ppc64le-redhat-linux"}; + "powerpc64le-unknown-linux-gnu", "powerpc64le-none-linux-gnu", + "powerpc64le-suse-linux", "ppc64le-redhat-linux"}; static const char *const RISCV32LibDirs[] = {"/lib32", "/lib"}; static const char *const RISCV32Triples[] = {"riscv32-unknown-linux-gnu", - "riscv32-linux-gnu", "riscv32-unknown-elf"}; static const char *const RISCV64LibDirs[] = {"/lib64", "/lib"}; static const char *const RISCV64Triples[] = {"riscv64-unknown-linux-gnu", - "riscv64-linux-gnu", "riscv64-unknown-elf"}; static const char *const SPARCv8LibDirs[] = {"/lib32", "/lib"}; - static const char *const SPARCv8Triples[] = {"sparc-linux-gnu", - "sparcv8-linux-gnu"}; + static const char *const SPARCv8Triples[] = {"sparcv8-linux-gnu"}; static const char *const SPARCv9LibDirs[] = {"/lib64", "/lib"}; - static const char *const SPARCv9Triples[] = {"sparc64-linux-gnu", - "sparcv9-linux-gnu"}; + static const char *const SPARCv9Triples[] = {"sparcv9-linux-gnu"}; static const char *const SystemZLibDirs[] = {"/lib64", "/lib"}; static const char *const SystemZTriples[] = { - "s390x-linux-gnu", 
"s390x-unknown-linux-gnu", "s390x-ibm-linux-gnu", - "s390x-suse-linux", "s390x-redhat-linux"}; - + "s390x-unknown-linux-gnu", "s390x-ibm-linux-gnu", "s390x-suse-linux", + "s390x-redhat-linux"}; using std::begin; using std::end; From 760c2aa55f0c5f56bed944328b23aa3f2f764346 Mon Sep 17 00:00:00 2001 From: PiJoules <6019989+PiJoules@users.noreply.github.com> Date: Tue, 28 May 2024 15:37:03 -0700 Subject: [PATCH 029/230] [lld] Support thumb PLTs (#86223) We are using PLTs for cortex-m33 which only supports thumb. More specifically, this is for a very restricted use case. There's no MMU so there's no sharing of virtual addresses between two processes, but this is fine. The MCU is used for running [chre nanoapps](https://android.googlesource.com/platform/system/chre/+/HEAD/doc/nanoapp_overview.md) for android. Each nanoapp is a shared library (but effectively acts as an executable containing a test suite) that is loaded and run on the MCU one binary at a time and there's only one process running at a time, so we ensure that the same text segment cannot be shared by two different running executables. GNU LD supports thumb PLTs but we want to migrate to a clang toolchain and use LLD, so thumb PLTs are needed. --- lld/ELF/Arch/ARM.cpp | 176 +++++++++++++++++++-------- lld/ELF/Config.h | 1 + lld/ELF/InputFiles.cpp | 12 ++ lld/test/ELF/armv8-thumb-plt-reloc.s | 126 +++++++++++++++++++ 4 files changed, 262 insertions(+), 53 deletions(-) create mode 100644 lld/test/ELF/armv8-thumb-plt-reloc.s diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp index 687f9499009d5e..3e0efe540e1bf1 100644 --- a/lld/ELF/Arch/ARM.cpp +++ b/lld/ELF/Arch/ARM.cpp @@ -231,36 +231,71 @@ static void writePltHeaderLong(uint8_t *buf) { // The default PLT header requires the .got.plt to be within 128 Mb of the // .plt in the positive direction. 
void ARM::writePltHeader(uint8_t *buf) const { - // Use a similar sequence to that in writePlt(), the difference is the calling - // conventions mean we use lr instead of ip. The PLT entry is responsible for - // saving lr on the stack, the dynamic loader is responsible for reloading - // it. - const uint32_t pltData[] = { - 0xe52de004, // L1: str lr, [sp,#-4]! - 0xe28fe600, // add lr, pc, #0x0NN00000 &(.got.plt - L1 - 4) - 0xe28eea00, // add lr, lr, #0x000NN000 &(.got.plt - L1 - 4) - 0xe5bef000, // ldr pc, [lr, #0x00000NNN] &(.got.plt -L1 - 4) - }; - - uint64_t offset = in.gotPlt->getVA() - in.plt->getVA() - 4; - if (!llvm::isUInt<27>(offset)) { - // We cannot encode the Offset, use the long form. - writePltHeaderLong(buf); - return; + if (config->armThumbPLTs) { + // The instruction sequence for thumb: + // + // 0: b500 push {lr} + // 2: f8df e008 ldr.w lr, [pc, #0x8] @ 0xc + // 6: 44fe add lr, pc + // 8: f85e ff08 ldr pc, [lr, #8]! + // c: .word .got.plt - .plt - 16 + // + // At 0x8, we want to jump to .got.plt, the -16 accounts for 8 bytes from + // `pc` in the add instruction and 8 bytes for the `lr` adjustment. + // + uint64_t offset = in.gotPlt->getVA() - in.plt->getVA() - 16; + assert(llvm::isUInt<32>(offset) && "This should always fit into a 32-bit offset"); + write16(buf + 0, 0xb500); + // Split into two halves to support endianness correctly. + write16(buf + 2, 0xf8df); + write16(buf + 4, 0xe008); + write16(buf + 6, 0x44fe); + // Split into two halves to support endianness correctly. + write16(buf + 8, 0xf85e); + write16(buf + 10, 0xff08); + write32(buf + 12, offset); + + memcpy(buf + 16, trapInstr.data(), 4); // Pad to 32-byte boundary + memcpy(buf + 20, trapInstr.data(), 4); + memcpy(buf + 24, trapInstr.data(), 4); + memcpy(buf + 28, trapInstr.data(), 4); + } else { + // Use a similar sequence to that in writePlt(), the difference is the + // calling conventions mean we use lr instead of ip.
The PLT entry is + // responsible for saving lr on the stack, the dynamic loader is responsible + // for reloading it. + const uint32_t pltData[] = { + 0xe52de004, // L1: str lr, [sp,#-4]! + 0xe28fe600, // add lr, pc, #0x0NN00000 &(.got.plt - L1 - 4) + 0xe28eea00, // add lr, lr, #0x000NN000 &(.got.plt - L1 - 4) + 0xe5bef000, // ldr pc, [lr, #0x00000NNN] &(.got.plt -L1 - 4) + }; + + uint64_t offset = in.gotPlt->getVA() - in.plt->getVA() - 4; + if (!llvm::isUInt<27>(offset)) { + // We cannot encode the Offset, use the long form. + writePltHeaderLong(buf); + return; + } + write32(buf + 0, pltData[0]); + write32(buf + 4, pltData[1] | ((offset >> 20) & 0xff)); + write32(buf + 8, pltData[2] | ((offset >> 12) & 0xff)); + write32(buf + 12, pltData[3] | (offset & 0xfff)); + memcpy(buf + 16, trapInstr.data(), 4); // Pad to 32-byte boundary + memcpy(buf + 20, trapInstr.data(), 4); + memcpy(buf + 24, trapInstr.data(), 4); + memcpy(buf + 28, trapInstr.data(), 4); } - write32(buf + 0, pltData[0]); - write32(buf + 4, pltData[1] | ((offset >> 20) & 0xff)); - write32(buf + 8, pltData[2] | ((offset >> 12) & 0xff)); - write32(buf + 12, pltData[3] | (offset & 0xfff)); - memcpy(buf + 16, trapInstr.data(), 4); // Pad to 32-byte boundary - memcpy(buf + 20, trapInstr.data(), 4); - memcpy(buf + 24, trapInstr.data(), 4); - memcpy(buf + 28, trapInstr.data(), 4); } void ARM::addPltHeaderSymbols(InputSection &isec) const { - addSyntheticLocal("$a", STT_NOTYPE, 0, 0, isec); - addSyntheticLocal("$d", STT_NOTYPE, 16, 0, isec); + if (config->armThumbPLTs) { + addSyntheticLocal("$t", STT_NOTYPE, 0, 0, isec); + addSyntheticLocal("$d", STT_NOTYPE, 12, 0, isec); + } else { + addSyntheticLocal("$a", STT_NOTYPE, 0, 0, isec); + addSyntheticLocal("$d", STT_NOTYPE, 16, 0, isec); + } } // Long form PLT entries that do not have any restrictions on the displacement @@ -279,32 +314,65 @@ static void writePltLong(uint8_t *buf, uint64_t gotPltEntryAddr, // .plt in the positive direction. 
void ARM::writePlt(uint8_t *buf, const Symbol &sym, uint64_t pltEntryAddr) const { - // The PLT entry is similar to the example given in Appendix A of ELF for - // the Arm Architecture. Instead of using the Group Relocations to find the - // optimal rotation for the 8-bit immediate used in the add instructions we - // hard code the most compact rotations for simplicity. This saves a load - // instruction over the long plt sequences. - const uint32_t pltData[] = { - 0xe28fc600, // L1: add ip, pc, #0x0NN00000 Offset(&(.got.plt) - L1 - 8 - 0xe28cca00, // add ip, ip, #0x000NN000 Offset(&(.got.plt) - L1 - 8 - 0xe5bcf000, // ldr pc, [ip, #0x00000NNN] Offset(&(.got.plt) - L1 - 8 - }; - uint64_t offset = sym.getGotPltVA() - pltEntryAddr - 8; - if (!llvm::isUInt<27>(offset)) { - // We cannot encode the Offset, use the long form. - writePltLong(buf, sym.getGotPltVA(), pltEntryAddr); - return; + if (!config->armThumbPLTs) { + uint64_t offset = sym.getGotPltVA() - pltEntryAddr - 8; + + // The PLT entry is similar to the example given in Appendix A of ELF for + // the Arm Architecture. Instead of using the Group Relocations to find the + // optimal rotation for the 8-bit immediate used in the add instructions we + // hard code the most compact rotations for simplicity. This saves a load + // instruction over the long plt sequences. + const uint32_t pltData[] = { + 0xe28fc600, // L1: add ip, pc, #0x0NN00000 Offset(&(.got.plt) - L1 - 8 + 0xe28cca00, // add ip, ip, #0x000NN000 Offset(&(.got.plt) - L1 - 8 + 0xe5bcf000, // ldr pc, [ip, #0x00000NNN] Offset(&(.got.plt) - L1 - 8 + }; + if (!llvm::isUInt<27>(offset)) { + // We cannot encode the Offset, use the long form. 
+ writePltLong(buf, sym.getGotPltVA(), pltEntryAddr); + return; + } + write32(buf + 0, pltData[0] | ((offset >> 20) & 0xff)); + write32(buf + 4, pltData[1] | ((offset >> 12) & 0xff)); + write32(buf + 8, pltData[2] | (offset & 0xfff)); + memcpy(buf + 12, trapInstr.data(), 4); // Pad to 16-byte boundary + } else { + uint64_t offset = sym.getGotPltVA() - pltEntryAddr - 12; + assert(llvm::isUInt<32>(offset) && "This should always fit into a 32-bit offset"); + + // A PLT entry will be: + // + // movw ip, # + // movt ip, # + // add ip, pc + // L1: ldr.w pc, [ip] + // b L1 + // + // where ip = r12 = 0xc + + // movw ip, # + write16(buf + 2, 0x0c00); // use `ip` + relocateNoSym(buf, R_ARM_THM_MOVW_ABS_NC, offset); + + // movt ip, # + write16(buf + 6, 0x0c00); // use `ip` + relocateNoSym(buf + 4, R_ARM_THM_MOVT_ABS, offset); + + write16(buf + 8, 0x44fc); // add ip, pc + write16(buf + 10, 0xf8dc); // ldr.w pc, [ip] (bottom half) + write16(buf + 12, 0xf000); // ldr.w pc, [ip] (upper half) + write16(buf + 14, 0xe7fc); // Branch to previous instruction } - write32(buf + 0, pltData[0] | ((offset >> 20) & 0xff)); - write32(buf + 4, pltData[1] | ((offset >> 12) & 0xff)); - write32(buf + 8, pltData[2] | (offset & 0xfff)); - memcpy(buf + 12, trapInstr.data(), 4); // Pad to 16-byte boundary } void ARM::addPltSymbols(InputSection &isec, uint64_t off) const { - addSyntheticLocal("$a", STT_NOTYPE, off, 0, isec); - addSyntheticLocal("$d", STT_NOTYPE, off + 12, 0, isec); + if (config->armThumbPLTs) { + addSyntheticLocal("$t", STT_NOTYPE, off, 0, isec); + } else { + addSyntheticLocal("$a", STT_NOTYPE, off, 0, isec); + addSyntheticLocal("$d", STT_NOTYPE, off + 12, 0, isec); + } } bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file, @@ -325,6 +393,8 @@ bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file, case R_ARM_JUMP24: // Source is ARM, all PLT entries are ARM so no interworking required. 
// Otherwise we need to interwork if STT_FUNC Symbol has bit 0 set (Thumb). + assert(!config->armThumbPLTs && + "If the source is ARM, we should not need Thumb PLTs"); if (s.isFunc() && expr == R_PC && (s.getVA() & 1)) return true; [[fallthrough]]; @@ -335,9 +405,9 @@ bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file, } case R_ARM_THM_JUMP19: case R_ARM_THM_JUMP24: - // Source is Thumb, all PLT entries are ARM so interworking is required. + // Source is Thumb, when all PLT entries are ARM interworking is required. // Otherwise we need to interwork if STT_FUNC Symbol has bit 0 clear (ARM). - if (expr == R_PLT_PC || (s.isFunc() && (s.getVA() & 1) == 0)) + if ((expr == R_PLT_PC && !config->armThumbPLTs) || (s.isFunc() && (s.getVA() & 1) == 0)) return true; [[fallthrough]]; case R_ARM_THM_CALL: { @@ -547,7 +617,6 @@ void ARM::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const { // STT_FUNC we choose whether to write a BL or BLX depending on the // value of bit 0 of Val. With bit 0 == 1 denoting Thumb. If the symbol is // not of type STT_FUNC then we must preserve the original instruction. - // PLT entries are always ARM state so we know we don't need to interwork. assert(rel.sym); // R_ARM_CALL is always reached via relocate(). bool bit0Thumb = val & 1; bool isBlx = (read32(loc) & 0xfe000000) == 0xfa000000; @@ -606,12 +675,13 @@ void ARM::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const { // PLT entries are always ARM state so we know we need to interwork. assert(rel.sym); // R_ARM_THM_CALL is always reached via relocate(). bool bit0Thumb = val & 1; + bool useThumb = bit0Thumb || config->armThumbPLTs; bool isBlx = (read16(loc + 2) & 0x1000) == 0; // lld 10.0 and before always used bit0Thumb when deciding to write a BLX - // even when type not STT_FUNC. PLT entries generated by LLD are always ARM. - if (!rel.sym->isFunc() && !rel.sym->isInPlt() && isBlx == bit0Thumb) + // even when type not STT_FUNC. 
+ if (!rel.sym->isFunc() && !rel.sym->isInPlt() && isBlx == useThumb) stateChangeWarning(loc, rel.type, *rel.sym); - if (rel.sym->isFunc() || rel.sym->isInPlt() ? !bit0Thumb : isBlx) { + if ((rel.sym->isFunc() || rel.sym->isInPlt()) ? !useThumb : isBlx) { // We are writing a BLX. Ensure BLX destination is 4-byte aligned. As // the BLX instruction may only be two byte aligned. This must be done // before overflow check. diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h index f0dfe7f377de0e..883c4a2f84294c 100644 --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -217,6 +217,7 @@ struct Config { bool allowMultipleDefinition; bool fatLTOObjects; bool androidPackDynRelocs = false; + bool armThumbPLTs = false; bool armHasBlx = false; bool armHasMovtMovw = false; bool armJ1J2BranchEncoding = false; diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index 1f496026d3ae20..d760dddcf5ec5c 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -194,6 +194,18 @@ static void updateSupportedARMFeatures(const ARMAttributeParser &attributes) { if (arch >= ARMBuildAttrs::CPUArch::v8_M_Base && profile == ARMBuildAttrs::MicroControllerProfile) config->armCMSESupport = true; + + // The thumb PLT entries require Thumb2 which can be used on multiple archs. + // For now, let's limit it to ones where ARM isn't available and we know we + // have Thumb2.
+ std::optional armISA = + attributes.getAttributeValue(ARMBuildAttrs::ARM_ISA_use); + std::optional thumb = + attributes.getAttributeValue(ARMBuildAttrs::THUMB_ISA_use); + bool noArmISA = !armISA || *armISA == ARMBuildAttrs::Not_Allowed; + bool hasThumb2 = thumb && *thumb >= ARMBuildAttrs::AllowThumb32; + if (noArmISA && hasThumb2) + config->armThumbPLTs = true; } InputFile::InputFile(Kind k, MemoryBufferRef m) diff --git a/lld/test/ELF/armv8-thumb-plt-reloc.s b/lld/test/ELF/armv8-thumb-plt-reloc.s new file mode 100644 index 00000000000000..47cd5c1b741ee0 --- /dev/null +++ b/lld/test/ELF/armv8-thumb-plt-reloc.s @@ -0,0 +1,126 @@ +// REQUIRES: arm +// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumb --mcpu=cortex-m33 %p/Inputs/arm-plt-reloc.s -o %t1 +// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumb --mcpu=cortex-m33 %s -o %t2 +// RUN: ld.lld %t1 %t2 -o %t +// RUN: llvm-objdump --no-print-imm-hex -d %t | FileCheck %s +// RUN: ld.lld -shared %t1 %t2 -o %t.so +// RUN: llvm-objdump --no-print-imm-hex -d %t.so | FileCheck --check-prefix=DSO %s +// RUN: llvm-readelf -S -r %t.so | FileCheck -check-prefix=DSOREL %s + +// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumbeb --mcpu=cortex-m33 %p/Inputs/arm-plt-reloc.s -o %t1.be +// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumbeb --mcpu=cortex-m33 %s -o %t2.be +// RUN: ld.lld %t1.be %t2.be -o %t.be +// RUN: llvm-objdump --no-print-imm-hex -d %t.be | FileCheck %s +// RUN: ld.lld -shared %t1.be %t2.be -o %t.so.be +// RUN: llvm-objdump --no-print-imm-hex -d %t.so.be | FileCheck --check-prefix=DSO %s +// RUN: llvm-readelf -S -r %t.so.be | FileCheck -check-prefix=DSOREL %s + +// RUN: ld.lld --be8 %t1.be %t2.be -o %t.be +// RUN: llvm-objdump --no-print-imm-hex -d %t.be | FileCheck %s +// RUN: ld.lld --be8 -shared %t1.be %t2.be -o %t.so.be +// RUN: llvm-objdump --no-print-imm-hex -d %t.so.be | FileCheck --check-prefix=DSO %s +// RUN: llvm-readelf -S -r %t.so.be 
| FileCheck -check-prefix=DSOREL %s + +/// Test PLT entry generation + .text + .align 2 + .globl _start + .type _start,%function +_start: + bl func1 + bl func2 + bl func3 + b.w func1 + b.w func2 + b.w func3 + beq.w func1 + beq.w func2 + beq.w func3 + +/// Executable, expect no PLT +// CHECK: Disassembly of section .text: +// CHECK-EMPTY: +// CHECK-NEXT: : +// CHECK-NEXT: bx lr +// CHECK: : +// CHECK-NEXT: bx lr +// CHECK: : +// CHECK-NEXT: bx lr +// CHECK-NEXT: d4d4 +// CHECK: <_start>: +// CHECK-NEXT: bl {{.*}} +// CHECK-NEXT: bl {{.*}} +// CHECK-NEXT: bl {{.*}} +// CHECK-NEXT: b.w {{.*}} +// CHECK-NEXT: b.w {{.*}} +// CHECK-NEXT: b.w {{.*}} +// CHECK-NEXT: beq.w {{.*}} +// CHECK-NEXT: beq.w {{.*}} +// CHECK-NEXT: beq.w {{.*}} + +// DSO: Disassembly of section .text: +// DSO-EMPTY: +// DSO-NEXT: : +// DSO-NEXT: bx lr +// DSO: : +// DSO-NEXT: bx lr +// DSO: : +// DSO-NEXT: bx lr +// DSO-NEXT: d4d4 +// DSO: <_start>: +/// 0x10260 = PLT func1 +// DSO-NEXT: bl 0x10260 +/// 0x10270 = PLT func2 +// DSO-NEXT: bl 0x10270 +/// 0x10280 = PLT func3 +// DSO-NEXT: bl 0x10280 +/// 0x10260 = PLT func1 +// DSO-NEXT: b.w 0x10260 +/// 0x10270 = PLT func2 +// DSO-NEXT: b.w 0x10270 +/// 0x10280 = PLT func3 +// DSO-NEXT: b.w 0x10280 +/// 0x10260 = PLT func1 +// DSO-NEXT: beq.w 0x10260 +/// 0x10270 = PLT func2 +// DSO-NEXT: beq.w 0x10270 +/// 0x10280 = PLT func3 +// DSO-NEXT: beq.w 0x10280 +// DSO: Disassembly of section .plt: +// DSO-EMPTY: +// DSO-NEXT: 10240 <.plt>: +// DSO-NEXT: push {lr} +// DSO-NEXT: ldr.w lr, [pc, #8] +// DSO-NEXT: add lr, pc +// DSO-NEXT: ldr pc, [lr, #8]! 
+/// 0x20098 = .got.plt (0x302e8) - pc (0x10248 = .plt + 8) - 8 +// DSO-NEXT: .word 0x00020098 +// DSO-NEXT: .word 0xd4d4d4d4 +// DSO-NEXT: .word 0xd4d4d4d4 +// DSO-NEXT: .word 0xd4d4d4d4 +// DSO-NEXT: .word 0xd4d4d4d4 + +/// 136 + 2 << 16 + 0x1026c = 0x302f4 = got entry 1 +// DSO-NEXT: 10260: f240 0c88 movw r12, #136 +// DSO-NEXT: f2c0 0c02 movt r12, #2 +// DSO-NEXT: 44fc add r12, pc +// DSO-NEXT: f8dc f000 ldr.w pc, [r12] +// DSO-NEXT: e7fc b 0x1026a +/// 124 + 2 << 16 + 0x1027c = 0x302f8 = got entry 2 +// DSO-NEXT: 10270: f240 0c7c movw r12, #124 +// DSO-NEXT: f2c0 0c02 movt r12, #2 +// DSO-NEXT: 44fc add r12, pc +// DSO-NEXT: f8dc f000 ldr.w pc, [r12] +// DSO-NEXT: e7fc b 0x1027a +/// 112 + 2 << 16 + 0x1028c = 0x302fc = got entry 3 +// DSO-NEXT: 10280: f240 0c70 movw r12, #112 +// DSO-NEXT: f2c0 0c02 movt r12, #2 +// DSO-NEXT: 44fc add r12, pc +// DSO-NEXT: f8dc f000 ldr.w pc, [r12] +// DSO-NEXT: e7fc b 0x1028a + +// DSOREL: .got.plt PROGBITS 000302e8 {{.*}} 000018 00 WA 0 0 4 +// DSOREL: Relocation section '.rel.plt' +// DSOREL: 000302f4 {{.*}} R_ARM_JUMP_SLOT {{.*}} func1 +// DSOREL: 000302f8 {{.*}} R_ARM_JUMP_SLOT {{.*}} func2 +// DSOREL: 000302fc {{.*}} R_ARM_JUMP_SLOT {{.*}} func3 From f7c8a0339c64810a3c1b28d9b3b20e02a2be6232 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 28 May 2024 15:54:44 -0700 Subject: [PATCH 030/230] [RISCV] Combine vXi32 (mul (and (lshr X, 15), 0x10001), 0xffff) -> (bitcast (sra (v2Xi16 (bitcast X)), 15)) (#93565) Similar for i16 and i64 elements for both fixed and scalable vectors. This reduces the number of vector instructions, but increases vl/vtype toggles. This reduces some code in 525.x264_r from SPEC2017. In that usage, the vectors are fixed with a small number of elements so vsetivli can be used. This is similar to `performMulVectorCmpZeroCombine` from AArch64.
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 41 +++++++ llvm/test/CodeGen/RISCV/rvv/mul-combine.ll | 117 ++++++++++++++++++++ 2 files changed, 158 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/rvv/mul-combine.ll diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 5fc613c1b2a140..e99c6208594e3b 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -13704,6 +13704,44 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// Combine vXi32 (mul (and (lshr X, 15), 0x10001), 0xffff) -> +// (bitcast (sra (v2Xi16 (bitcast X)), 15)) +// Same for other equivalent types with other equivalent constants. +static SDValue combineVectorMulToSraBitcast(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // Do this for legal vectors unless they are i1 or i8 vectors. + if (!VT.isVector() || !TLI.isTypeLegal(VT) || VT.getScalarSizeInBits() < 16) + return SDValue(); + + if (N->getOperand(0).getOpcode() != ISD::AND || + N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL) + return SDValue(); + + SDValue And = N->getOperand(0); + SDValue Srl = And.getOperand(0); + + APInt V1, V2, V3; + if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) || + !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) || + !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3)) + return SDValue(); + + unsigned HalfSize = VT.getScalarSizeInBits() / 2; + if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) || + V3 != (HalfSize - 1)) + return SDValue(); + + EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), + EVT::getIntegerVT(*DAG.getContext(), HalfSize), + VT.getVectorElementCount() * 2); + SDLoc DL(N); + SDValue Cast = DAG.getNode(ISD::BITCAST, DL, HalfVT, Srl.getOperand(0)); + SDValue Sra = DAG.getNode(ISD::SRA, DL, HalfVT, Cast, + 
DAG.getConstant(HalfSize - 1, DL, HalfVT)); + return DAG.getNode(ISD::BITCAST, DL, VT, Sra); +} static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -13748,6 +13786,9 @@ static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineBinOpOfZExt(N, DAG)) return V; + if (SDValue V = combineVectorMulToSraBitcast(N, DAG)) + return V; + return SDValue(); } diff --git a/llvm/test/CodeGen/RISCV/rvv/mul-combine.ll b/llvm/test/CodeGen/RISCV/rvv/mul-combine.ll new file mode 100644 index 00000000000000..6a7da925b4d43d --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/mul-combine.ll @@ -0,0 +1,117 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-RV64 + +define <2 x i16> @test_v2i16(<2 x i16> %x) { +; CHECK-RV32-LABEL: test_v2i16: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-RV32-NEXT: vsra.vi v8, v8, 7 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_v2i16: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-RV64-NEXT: vsra.vi v8, v8, 7 +; CHECK-RV64-NEXT: ret + %1 = lshr <2 x i16> %x, + %2 = and <2 x i16> %1, + %3 = mul <2 x i16> %2, + ret <2 x i16> %3 +} + +define @test_nxv2i16( %x) { +; CHECK-RV32-LABEL: test_nxv2i16: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-RV32-NEXT: vsrl.vi v8, v8, 7 +; CHECK-RV32-NEXT: li a0, 257 +; CHECK-RV32-NEXT: vand.vx v8, v8, a0 +; CHECK-RV32-NEXT: vsll.vi v8, v8, 8 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_nxv2i16: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-RV64-NEXT: vsrl.vi v8, v8, 7 +; CHECK-RV64-NEXT: li a0, 257 +; CHECK-RV64-NEXT: vand.vx v8, v8, a0 
+; CHECK-RV64-NEXT: vsll.vi v8, v8, 8 +; CHECK-RV64-NEXT: ret + %1 = lshr %x, splat (i16 7) + %2 = and %1, splat (i16 257) + %3 = mul %2, splat (i16 256) + ret %3 +} + +define <2 x i32> @test_v2i32(<2 x i32> %x) { +; CHECK-RV32-LABEL: test_v2i32: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-RV32-NEXT: vsra.vi v8, v8, 15 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_v2i32: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-RV64-NEXT: vsra.vi v8, v8, 15 +; CHECK-RV64-NEXT: ret + %1 = lshr <2 x i32> %x, + %2 = and <2 x i32> %1, + %3 = mul <2 x i32> %2, + ret <2 x i32> %3 +} + +define @test_nxv2i32( %x) { +; CHECK-RV32-LABEL: test_nxv2i32: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-RV32-NEXT: vsra.vi v8, v8, 15 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_nxv2i32: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-RV64-NEXT: vsra.vi v8, v8, 15 +; CHECK-RV64-NEXT: ret + %1 = lshr %x, splat (i32 15) + %2 = and %1, splat (i32 65537) + %3 = mul %2, splat (i32 65535) + ret %3 +} + +define <2 x i64> @test_v2i64(<2 x i64> %x) { +; CHECK-RV32-LABEL: test_v2i64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-RV32-NEXT: vsra.vi v8, v8, 31 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_v2i64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-RV64-NEXT: vsra.vi v8, v8, 31 +; CHECK-RV64-NEXT: ret + %1 = lshr <2 x i64> %x, + %2 = and <2 x i64> %1, + %3 = mul <2 x i64> %2, + ret <2 x i64> %3 +} + +define @test_nxv2i64( %x) { +; CHECK-RV32-LABEL: test_nxv2i64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-RV32-NEXT: vsra.vi v8, v8, 31 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_nxv2i64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-RV64-NEXT: 
vsra.vi v8, v8, 31 +; CHECK-RV64-NEXT: ret + %1 = lshr %x, splat (i64 31) + %2 = and %1, splat (i64 4294967297) + %3 = mul %2, splat (i64 4294967295) + ret %3 +} From 0694552cb7e8b2041fd5e765cf5b83fc40664087 Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Tue, 28 May 2024 15:56:17 -0700 Subject: [PATCH 031/230] [libc] clean up MutexLock (#93619) --- libc/src/__support/threads/linux/CMakeLists.txt | 1 + libc/src/__support/threads/linux/CndVar.cpp | 7 ++++--- libc/src/__support/threads/mutex.h | 14 -------------- 3 files changed, 5 insertions(+), 17 deletions(-) diff --git a/libc/src/__support/threads/linux/CMakeLists.txt b/libc/src/__support/threads/linux/CMakeLists.txt index 39c4ad20201ca6..f6913ef0834289 100644 --- a/libc/src/__support/threads/linux/CMakeLists.txt +++ b/libc/src/__support/threads/linux/CMakeLists.txt @@ -75,4 +75,5 @@ add_object_library( libc.src.__support.OSUtil.osutil libc.src.__support.threads.linux.futex_word_type libc.src.__support.threads.mutex + libc.src.__support.CPP.mutex ) diff --git a/libc/src/__support/threads/linux/CndVar.cpp b/libc/src/__support/threads/linux/CndVar.cpp index daf56bca1ed21b..b3a0fdbda4e9ea 100644 --- a/libc/src/__support/threads/linux/CndVar.cpp +++ b/libc/src/__support/threads/linux/CndVar.cpp @@ -7,9 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/__support/threads/CndVar.h" +#include "src/__support/CPP/mutex.h" #include "src/__support/OSUtil/syscall.h" // syscall_impl #include "src/__support/threads/linux/futex_word.h" // FutexWordType -#include "src/__support/threads/mutex.h" // Mutex, MutexLock +#include "src/__support/threads/mutex.h" // Mutex #include // For syscall numbers. 
@@ -27,7 +28,7 @@ int CndVar::wait(Mutex *m) { CndWaiter waiter; { - MutexLock ml(&qmtx); + cpp::lock_guard ml(qmtx); CndWaiter *old_back = nullptr; if (waitq_front == nullptr) { waitq_front = waitq_back = &waiter; @@ -83,7 +84,7 @@ void CndVar::notify_one() { } void CndVar::broadcast() { - MutexLock ml(&qmtx); + cpp::lock_guard ml(qmtx); uint32_t dummy_futex_word; CndWaiter *waiter = waitq_front; waitq_front = waitq_back = nullptr; diff --git a/libc/src/__support/threads/mutex.h b/libc/src/__support/threads/mutex.h index 9dded2e3f952a1..392b38984dc0ae 100644 --- a/libc/src/__support/threads/mutex.h +++ b/libc/src/__support/threads/mutex.h @@ -43,18 +43,4 @@ #include "src/__support/threads/gpu/mutex.h" #endif // __linux__ -namespace LIBC_NAMESPACE { - -// An RAII class for easy locking and unlocking of mutexes. -class MutexLock { - Mutex *mutex; - -public: - explicit MutexLock(Mutex *m) : mutex(m) { mutex->lock(); } - - ~MutexLock() { mutex->unlock(); } -}; - -} // namespace LIBC_NAMESPACE - #endif // LLVM_LIBC_SRC___SUPPORT_THREADS_MUTEX_H From c179d50fd3d84311708701d84e3bca60570d3d7f Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 28 May 2024 16:10:11 -0700 Subject: [PATCH 032/230] [WebAssembly] Add exnref type (#93586) This adds (back) the exnref type restored in the new EH proposal adopted in the Oct 2023 CG meeting: https://github.com/WebAssembly/exception-handling/blob/main/proposals/exception-handling/Exceptions.md --- lld/wasm/WriterUtils.cpp | 2 ++ llvm/include/llvm/BinaryFormat/Wasm.h | 9 ++++--- llvm/include/llvm/CodeGen/ValueTypes.td | 9 ++++--- llvm/include/llvm/IR/Intrinsics.td | 2 ++ llvm/include/llvm/IR/IntrinsicsWebAssembly.td | 18 +++++++++++++ llvm/lib/CodeGen/ValueTypes.cpp | 1 + llvm/lib/Object/WasmObjectFile.cpp | 8 ++++-- llvm/lib/ObjectYAML/WasmYAML.cpp | 2 ++ .../MCTargetDesc/WebAssemblyMCTargetDesc.h | 12 +++++++++ .../WebAssemblyMCTypeUtilities.cpp | 6 +++++ .../MCTargetDesc/WebAssemblyMCTypeUtilities.h | 4 ++- 
.../Utils/WebAssemblyTypeUtilities.cpp | 3 +++ .../WebAssembly/WebAssemblyAsmPrinter.cpp | 2 ++ .../WebAssembly/WebAssemblyExplicitLocals.cpp | 10 +++++++ .../WebAssembly/WebAssemblyFastISel.cpp | 16 ++++++++++++ .../WebAssembly/WebAssemblyISelLowering.cpp | 3 +++ .../WebAssembly/WebAssemblyInstrInfo.td | 3 +++ .../Target/WebAssembly/WebAssemblyInstrRef.td | 8 +++--- .../WebAssembly/WebAssemblyInstrTable.td | 2 ++ .../WebAssembly/WebAssemblyRegStackify.cpp | 2 ++ .../WebAssembly/WebAssemblyRegisterInfo.td | 2 ++ .../WebAssembly/WebAssemblyUtilities.cpp | 2 ++ .../test/CodeGen/WebAssembly/reg-argument.mir | 11 ++++++++ llvm/test/CodeGen/WebAssembly/reg-copy.mir | 11 ++++++++ llvm/test/MC/WebAssembly/basic-assembly.s | 21 +++++++++------ llvm/test/MC/WebAssembly/reference-types.s | 26 +++++++++++++++++-- .../test/MC/WebAssembly/type-checker-errors.s | 16 ++++++++++++ 27 files changed, 188 insertions(+), 23 deletions(-) diff --git a/lld/wasm/WriterUtils.cpp b/lld/wasm/WriterUtils.cpp index cdd2c42f939efe..c6a1592012e64c 100644 --- a/lld/wasm/WriterUtils.cpp +++ b/lld/wasm/WriterUtils.cpp @@ -35,6 +35,8 @@ std::string toString(ValType type) { return "funcref"; case ValType::EXTERNREF: return "externref"; + case ValType::EXNREF: + return "exnref"; case ValType::OTHERREF: return "otherref"; } diff --git a/llvm/include/llvm/BinaryFormat/Wasm.h b/llvm/include/llvm/BinaryFormat/Wasm.h index 38ef8e37df91d3..acf89885af6fdb 100644 --- a/llvm/include/llvm/BinaryFormat/Wasm.h +++ b/llvm/include/llvm/BinaryFormat/Wasm.h @@ -58,15 +58,16 @@ enum : unsigned { WASM_TYPE_V128 = 0x7B, WASM_TYPE_NULLFUNCREF = 0x73, WASM_TYPE_NULLEXTERNREF = 0x72, + WASM_TYPE_NULLEXNREF = 0x74, WASM_TYPE_NULLREF = 0x71, WASM_TYPE_FUNCREF = 0x70, WASM_TYPE_EXTERNREF = 0x6F, + WASM_TYPE_EXNREF = 0x69, WASM_TYPE_ANYREF = 0x6E, WASM_TYPE_EQREF = 0x6D, WASM_TYPE_I31REF = 0x6C, WASM_TYPE_STRUCTREF = 0x6B, WASM_TYPE_ARRAYREF = 0x6A, - WASM_TYPE_EXNREF = 0x69, WASM_TYPE_NONNULLABLE = 0x64, WASM_TYPE_NULLABLE = 
0x63, WASM_TYPE_FUNC = 0x60, @@ -261,8 +262,9 @@ enum class ValType { V128 = WASM_TYPE_V128, FUNCREF = WASM_TYPE_FUNCREF, EXTERNREF = WASM_TYPE_EXTERNREF, + EXNREF = WASM_TYPE_EXNREF, // Unmodeled value types include ref types with heap types other than - // func or extern, and type-specialized funcrefs + // func, extern or exn, and type-specialized funcrefs OTHERREF = 0xff, }; @@ -410,7 +412,8 @@ struct WasmDataSegment { // 1) Does not model passive or declarative segments (Segment will end up with // an Offset field of i32.const 0) // 2) Does not model init exprs (Segment will get an empty Functions list) -// 2) Does not model types other than basic funcref/externref (see ValType) +// 3) Does not model types other than basic funcref/externref/exnref (see +// ValType) struct WasmElemSegment { uint32_t Flags; uint32_t TableNumber; diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td index c3e378ed8f6edb..e322cc04c1c769 100644 --- a/llvm/include/llvm/CodeGen/ValueTypes.td +++ b/llvm/include/llvm/CodeGen/ValueTypes.td @@ -280,11 +280,12 @@ def untyped : ValueType<8, 193> { // Produces an untyped value } def funcref : ValueType<0, 194>; // WebAssembly's funcref type def externref : ValueType<0, 195>; // WebAssembly's externref type -def x86amx : ValueType<8192, 196>; // X86 AMX value -def i64x8 : ValueType<512, 197>; // 8 Consecutive GPRs (AArch64) +def exnref : ValueType<0, 196>; // WebAssembly's exnref type +def x86amx : ValueType<8192, 197>; // X86 AMX value +def i64x8 : ValueType<512, 198>; // 8 Consecutive GPRs (AArch64) def aarch64svcount - : ValueType<16, 198>; // AArch64 predicate-as-counter -def spirvbuiltin : ValueType<0, 199>; // SPIR-V's builtin type + : ValueType<16, 199>; // AArch64 predicate-as-counter +def spirvbuiltin : ValueType<0, 200>; // SPIR-V's builtin type def token : ValueType<0, 248>; // TokenTy def MetadataVT : ValueType<0, 249> { // Metadata diff --git a/llvm/include/llvm/IR/Intrinsics.td 
b/llvm/include/llvm/IR/Intrinsics.td index 3019f68083d422..c3ac53837444ef 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -316,6 +316,7 @@ def IIT_PPCF128 : IIT_VT; def IIT_V3 : IIT_Vec<3, 53>; def IIT_EXTERNREF : IIT_VT; def IIT_FUNCREF : IIT_VT; +def IIT_EXNREF: IIT_VT; def IIT_I2 : IIT_Int<2, 57>; def IIT_I4 : IIT_Int<4, 58>; def IIT_AARCH64_SVCOUNT : IIT_VT; @@ -581,6 +582,7 @@ def llvm_vararg_ty : LLVMType; // this means vararg here def llvm_externref_ty : LLVMType; def llvm_funcref_ty : LLVMType; +def llvm_exnref_ty : LLVMType; //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td index 237f268784bb02..47aab196a6d4f9 100644 --- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td +++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td @@ -31,12 +31,17 @@ def int_wasm_ref_null_extern : DefaultAttrsIntrinsic<[llvm_externref_ty], [], [IntrNoMem]>; def int_wasm_ref_null_func : DefaultAttrsIntrinsic<[llvm_funcref_ty], [], [IntrNoMem]>; +def int_wasm_ref_null_exn: + DefaultAttrsIntrinsic<[llvm_exnref_ty], [], [IntrNoMem]>; def int_wasm_ref_is_null_extern : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_externref_ty], [IntrNoMem], "llvm.wasm.ref.is_null.extern">; def int_wasm_ref_is_null_func : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_funcref_ty], [IntrNoMem], "llvm.wasm.ref.is_null.func">; +def int_wasm_ref_is_null_exn : + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_exnref_ty], [IntrNoMem], + "llvm.wasm.ref.is_null.exn">; //===----------------------------------------------------------------------===// // Table intrinsics @@ -47,6 +52,9 @@ def int_wasm_table_set_externref : def int_wasm_table_set_funcref : DefaultAttrsIntrinsic<[], [llvm_table_ty, llvm_i32_ty, llvm_funcref_ty], [IntrWriteMem]>; +def int_wasm_table_set_exnref : + DefaultAttrsIntrinsic<[], [llvm_table_ty, llvm_i32_ty, llvm_exnref_ty], + 
[IntrWriteMem]>; def int_wasm_table_get_externref : DefaultAttrsIntrinsic<[llvm_externref_ty], [llvm_table_ty, llvm_i32_ty], @@ -54,6 +62,9 @@ def int_wasm_table_get_externref : def int_wasm_table_get_funcref : DefaultAttrsIntrinsic<[llvm_funcref_ty], [llvm_table_ty, llvm_i32_ty], [IntrReadMem]>; +def int_wasm_table_get_exnref : + DefaultAttrsIntrinsic<[llvm_exnref_ty], [llvm_table_ty, llvm_i32_ty], + [IntrReadMem]>; // Query the current table size, and increase the current table size. def int_wasm_table_size : @@ -68,6 +79,9 @@ def int_wasm_table_grow_externref : def int_wasm_table_grow_funcref : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_table_ty, llvm_funcref_ty, llvm_i32_ty], []>; +def int_wasm_table_grow_exnref : + DefaultAttrsIntrinsic<[llvm_i32_ty], + [llvm_table_ty, llvm_exnref_ty, llvm_i32_ty], []>; def int_wasm_table_fill_externref : DefaultAttrsIntrinsic<[], [llvm_table_ty, llvm_i32_ty, llvm_externref_ty, @@ -76,6 +90,10 @@ def int_wasm_table_fill_funcref : DefaultAttrsIntrinsic<[], [llvm_table_ty, llvm_i32_ty, llvm_funcref_ty, llvm_i32_ty], []>; +def int_wasm_table_fill_exnref : + DefaultAttrsIntrinsic<[], + [llvm_table_ty, llvm_i32_ty, llvm_exnref_ty, + llvm_i32_ty], []>; //===----------------------------------------------------------------------===// // Trapping float-to-int conversions diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp index 3d5c58d282da56..df1c02c3dc67c2 100644 --- a/llvm/lib/CodeGen/ValueTypes.cpp +++ b/llvm/lib/CodeGen/ValueTypes.cpp @@ -181,6 +181,7 @@ std::string EVT::getEVTString() const { case MVT::Metadata: return "Metadata"; case MVT::Untyped: return "Untyped"; case MVT::funcref: return "funcref"; + case MVT::exnref: return "exnref"; case MVT::externref: return "externref"; case MVT::aarch64svcount: return "aarch64svcount"; diff --git a/llvm/lib/Object/WasmObjectFile.cpp b/llvm/lib/Object/WasmObjectFile.cpp index 6507a0e5950ebe..23381955c60a88 100644 --- a/llvm/lib/Object/WasmObjectFile.cpp +++ 
b/llvm/lib/Object/WasmObjectFile.cpp @@ -177,8 +177,8 @@ static uint8_t readOpcode(WasmObjectFile::ReadContext &Ctx) { static wasm::ValType parseValType(WasmObjectFile::ReadContext &Ctx, uint32_t Code) { - // only directly encoded FUNCREF/EXTERNREF are supported - // (not ref null func or ref null extern) + // only directly encoded FUNCREF/EXTERNREF/EXNREF are supported + // (not ref null func, ref null extern, or ref null exn) switch (Code) { case wasm::WASM_TYPE_I32: case wasm::WASM_TYPE_I64: @@ -187,6 +187,7 @@ static wasm::ValType parseValType(WasmObjectFile::ReadContext &Ctx, case wasm::WASM_TYPE_V128: case wasm::WASM_TYPE_FUNCREF: case wasm::WASM_TYPE_EXTERNREF: + case wasm::WASM_TYPE_EXNREF: return wasm::ValType(Code); } if (Code == wasm::WASM_TYPE_NULLABLE || Code == wasm::WASM_TYPE_NONNULLABLE) { @@ -1288,6 +1289,7 @@ Error WasmObjectFile::parseImportSection(ReadContext &Ctx) { auto ElemType = Im.Table.ElemType; if (ElemType != wasm::ValType::FUNCREF && ElemType != wasm::ValType::EXTERNREF && + ElemType != wasm::ValType::EXNREF && ElemType != wasm::ValType::OTHERREF) return make_error("invalid table element type", object_error::parse_failed); @@ -1346,6 +1348,7 @@ Error WasmObjectFile::parseTableSection(ReadContext &Ctx) { auto ElemType = Tables.back().Type.ElemType; if (ElemType != wasm::ValType::FUNCREF && ElemType != wasm::ValType::EXTERNREF && + ElemType != wasm::ValType::EXNREF && ElemType != wasm::ValType::OTHERREF) { return make_error("invalid table element type", object_error::parse_failed); @@ -1680,6 +1683,7 @@ Error WasmObjectFile::parseElemSection(ReadContext &Ctx) { Segment.ElemKind = parseValType(Ctx, ElemKind); if (Segment.ElemKind != wasm::ValType::FUNCREF && Segment.ElemKind != wasm::ValType::EXTERNREF && + Segment.ElemKind != wasm::ValType::EXNREF && Segment.ElemKind != wasm::ValType::OTHERREF) { return make_error("invalid elem type", object_error::parse_failed); diff --git a/llvm/lib/ObjectYAML/WasmYAML.cpp 
b/llvm/lib/ObjectYAML/WasmYAML.cpp index 544a91d03dce01..7ad338f65706d5 100644 --- a/llvm/lib/ObjectYAML/WasmYAML.cpp +++ b/llvm/lib/ObjectYAML/WasmYAML.cpp @@ -606,6 +606,7 @@ void ScalarEnumerationTraits::enumeration( ECase(V128); ECase(FUNCREF); ECase(EXTERNREF); + ECase(EXNREF); ECase(OTHERREF); #undef ECase } @@ -640,6 +641,7 @@ void ScalarEnumerationTraits::enumeration( #define ECase(X) IO.enumCase(Type, #X, CONCAT(X)); ECase(FUNCREF); ECase(EXTERNREF); + ECase(EXNREF); ECase(OTHERREF); #undef ECase } diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index 34502170a5c71f..b7498cb4299452 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -355,6 +355,8 @@ inline bool isArgument(unsigned Opc) { case WebAssembly::ARGUMENT_funcref_S: case WebAssembly::ARGUMENT_externref: case WebAssembly::ARGUMENT_externref_S: + case WebAssembly::ARGUMENT_exnref: + case WebAssembly::ARGUMENT_exnref_S: return true; default: return false; @@ -377,6 +379,8 @@ inline bool isCopy(unsigned Opc) { case WebAssembly::COPY_FUNCREF_S: case WebAssembly::COPY_EXTERNREF: case WebAssembly::COPY_EXTERNREF_S: + case WebAssembly::COPY_EXNREF: + case WebAssembly::COPY_EXNREF_S: return true; default: return false; @@ -399,6 +403,8 @@ inline bool isTee(unsigned Opc) { case WebAssembly::TEE_FUNCREF_S: case WebAssembly::TEE_EXTERNREF: case WebAssembly::TEE_EXTERNREF_S: + case WebAssembly::TEE_EXNREF: + case WebAssembly::TEE_EXNREF_S: return true; default: return false; @@ -489,6 +495,8 @@ inline bool isLocalGet(unsigned Opc) { case WebAssembly::LOCAL_GET_FUNCREF_S: case WebAssembly::LOCAL_GET_EXTERNREF: case WebAssembly::LOCAL_GET_EXTERNREF_S: + case WebAssembly::LOCAL_GET_EXNREF: + case WebAssembly::LOCAL_GET_EXNREF_S: return true; default: return false; @@ -511,6 +519,8 @@ inline bool 
isLocalSet(unsigned Opc) { case WebAssembly::LOCAL_SET_FUNCREF_S: case WebAssembly::LOCAL_SET_EXTERNREF: case WebAssembly::LOCAL_SET_EXTERNREF_S: + case WebAssembly::LOCAL_SET_EXNREF: + case WebAssembly::LOCAL_SET_EXNREF_S: return true; default: return false; @@ -533,6 +543,8 @@ inline bool isLocalTee(unsigned Opc) { case WebAssembly::LOCAL_TEE_FUNCREF_S: case WebAssembly::LOCAL_TEE_EXTERNREF: case WebAssembly::LOCAL_TEE_EXTERNREF_S: + case WebAssembly::LOCAL_TEE_EXNREF: + case WebAssembly::LOCAL_TEE_EXNREF_S: return true; default: return false; diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp index 8ea02bd2ad1ff0..d9c8e22bbbaf5b 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp @@ -27,6 +27,7 @@ std::optional WebAssembly::parseType(StringRef Type) { wasm::ValType::V128) .Case("funcref", wasm::ValType::FUNCREF) .Case("externref", wasm::ValType::EXTERNREF) + .Case("exnref", wasm::ValType::EXNREF) .Default(std::nullopt); } @@ -40,6 +41,7 @@ WebAssembly::BlockType WebAssembly::parseBlockType(StringRef Type) { .Case("v128", WebAssembly::BlockType::V128) .Case("funcref", WebAssembly::BlockType::Funcref) .Case("externref", WebAssembly::BlockType::Externref) + .Case("exnref", WebAssembly::BlockType::Exnref) .Case("void", WebAssembly::BlockType::Void) .Default(WebAssembly::BlockType::Invalid); } @@ -62,6 +64,8 @@ const char *WebAssembly::anyTypeToString(unsigned Type) { return "funcref"; case wasm::WASM_TYPE_EXTERNREF: return "externref"; + case wasm::WASM_TYPE_EXNREF: + return "exnref"; case wasm::WASM_TYPE_FUNC: return "func"; case wasm::WASM_TYPE_NORESULT: @@ -110,6 +114,8 @@ wasm::ValType WebAssembly::regClassToValType(unsigned RC) { return wasm::ValType::FUNCREF; case WebAssembly::EXTERNREFRegClassID: return wasm::ValType::EXTERNREF; + case 
WebAssembly::EXNREFRegClassID: + return wasm::ValType::EXNREF; default: llvm_unreachable("unexpected type"); } diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h index 486cf264d13e2f..063ee4dba9068e 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h @@ -32,6 +32,7 @@ enum class BlockType : unsigned { V128 = unsigned(wasm::ValType::V128), Externref = unsigned(wasm::ValType::EXTERNREF), Funcref = unsigned(wasm::ValType::FUNCREF), + Exnref = unsigned(wasm::ValType::EXNREF), // Multivalue blocks (and other non-void blocks) are only emitted when the // blocks will never be exited and are at the ends of functions (see // WebAssemblyCFGStackify::fixEndsAtEndOfFunction). They also are never made @@ -41,7 +42,8 @@ enum class BlockType : unsigned { }; inline bool isRefType(wasm::ValType Type) { - return Type == wasm::ValType::EXTERNREF || Type == wasm::ValType::FUNCREF; + return Type == wasm::ValType::EXTERNREF || Type == wasm::ValType::FUNCREF || + Type == wasm::ValType::EXNREF; } // Convert ValType or a list/signature of ValTypes to a string. 
diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp index 867953b4e8d71d..f9293460e701a0 100644 --- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp +++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp @@ -33,6 +33,7 @@ MVT WebAssembly::parseMVT(StringRef Type) { .Case("v2i64", MVT::v2i64) .Case("funcref", MVT::funcref) .Case("externref", MVT::externref) + .Case("exnref", MVT::exnref) .Default(MVT::INVALID_SIMPLE_VALUE_TYPE); } @@ -58,6 +59,8 @@ wasm::ValType WebAssembly::toValType(MVT Type) { return wasm::ValType::FUNCREF; case MVT::externref: return wasm::ValType::EXTERNREF; + case MVT::exnref: + return wasm::ValType::EXNREF; default: llvm_unreachable("unexpected type"); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index 443558537da245..0b7ec6e74cab20 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -125,6 +125,8 @@ static char getInvokeSig(wasm::ValType VT) { return 'F'; case wasm::ValType::EXTERNREF: return 'X'; + case wasm::ValType::EXNREF: + return 'E'; default: llvm_unreachable("Unhandled wasm::ValType enum"); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp index 0159c44a79b76d..3c6a29311a10e4 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp @@ -100,6 +100,8 @@ static unsigned getDropOpcode(const TargetRegisterClass *RC) { return WebAssembly::DROP_FUNCREF; if (RC == &WebAssembly::EXTERNREFRegClass) return WebAssembly::DROP_EXTERNREF; + if (RC == &WebAssembly::EXNREFRegClass) + return WebAssembly::DROP_EXNREF; llvm_unreachable("Unexpected register class"); } @@ -119,6 +121,8 @@ static unsigned 
getLocalGetOpcode(const TargetRegisterClass *RC) { return WebAssembly::LOCAL_GET_FUNCREF; if (RC == &WebAssembly::EXTERNREFRegClass) return WebAssembly::LOCAL_GET_EXTERNREF; + if (RC == &WebAssembly::EXNREFRegClass) + return WebAssembly::LOCAL_GET_EXNREF; llvm_unreachable("Unexpected register class"); } @@ -138,6 +142,8 @@ static unsigned getLocalSetOpcode(const TargetRegisterClass *RC) { return WebAssembly::LOCAL_SET_FUNCREF; if (RC == &WebAssembly::EXTERNREFRegClass) return WebAssembly::LOCAL_SET_EXTERNREF; + if (RC == &WebAssembly::EXNREFRegClass) + return WebAssembly::LOCAL_SET_EXNREF; llvm_unreachable("Unexpected register class"); } @@ -157,6 +163,8 @@ static unsigned getLocalTeeOpcode(const TargetRegisterClass *RC) { return WebAssembly::LOCAL_TEE_FUNCREF; if (RC == &WebAssembly::EXTERNREFRegClass) return WebAssembly::LOCAL_TEE_EXTERNREF; + if (RC == &WebAssembly::EXNREFRegClass) + return WebAssembly::LOCAL_TEE_EXNREF; llvm_unreachable("Unexpected register class"); } @@ -176,6 +184,8 @@ static MVT typeForRegClass(const TargetRegisterClass *RC) { return MVT::funcref; if (RC == &WebAssembly::EXTERNREFRegClass) return MVT::externref; + if (RC == &WebAssembly::EXNREFRegClass) + return MVT::exnref; llvm_unreachable("unrecognized register class"); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 26e13948bc9a68..aa3aa1b007a530 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -137,6 +137,10 @@ class WebAssemblyFastISel final : public FastISel { if (Subtarget->hasReferenceTypes()) return VT; break; + case MVT::exnref: + if (Subtarget->hasReferenceTypes() && Subtarget->hasExceptionHandling()) + return VT; + break; case MVT::f16: return MVT::f32; case MVT::v16i8: @@ -717,6 +721,10 @@ bool WebAssemblyFastISel::fastLowerArguments() { Opc = WebAssembly::ARGUMENT_externref; RC = &WebAssembly::EXTERNREFRegClass; break; + 
case MVT::exnref: + Opc = WebAssembly::ARGUMENT_exnref; + RC = &WebAssembly::EXNREFRegClass; + break; default: return false; } @@ -821,6 +829,9 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) { case MVT::externref: ResultReg = createResultReg(&WebAssembly::EXTERNREFRegClass); break; + case MVT::exnref: + ResultReg = createResultReg(&WebAssembly::EXNREFRegClass); + break; default: return false; } @@ -948,6 +959,10 @@ bool WebAssemblyFastISel::selectSelect(const Instruction *I) { Opc = WebAssembly::SELECT_EXTERNREF; RC = &WebAssembly::EXTERNREFRegClass; break; + case MVT::exnref: + Opc = WebAssembly::SELECT_EXNREF; + RC = &WebAssembly::EXNREFRegClass; + break; default: return false; } @@ -1355,6 +1370,7 @@ bool WebAssemblyFastISel::selectRet(const Instruction *I) { case MVT::v2f64: case MVT::funcref: case MVT::externref: + case MVT::exnref: break; default: return false; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 518b6932a0c879..f9f16498bb390c 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -76,6 +76,9 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( if (Subtarget->hasReferenceTypes()) { addRegisterClass(MVT::externref, &WebAssembly::EXTERNREFRegClass); addRegisterClass(MVT::funcref, &WebAssembly::FUNCREFRegClass); + if (Subtarget->hasExceptionHandling()) { + addRegisterClass(MVT::exnref, &WebAssembly::EXNREFRegClass); + } } // Compute derived properties from the register classes. 
computeRegisterProperties(Subtarget->getRegisterInfo()); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index c1a5a45395e87d..3d37eb2fa27bce 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -292,6 +292,7 @@ defm "": ARGUMENT; defm "": ARGUMENT; defm "": ARGUMENT; defm "": ARGUMENT; +defm "": ARGUMENT; // local.get and local.set are not generated by instruction selection; they // are implied by virtual register uses and defs. @@ -375,6 +376,8 @@ defm "" : LOCAL; defm "" : LOCAL, Requires<[HasSIMD128]>; defm "" : LOCAL, Requires<[HasReferenceTypes]>; defm "" : LOCAL, Requires<[HasReferenceTypes]>; +defm "" : LOCAL, + Requires<[HasReferenceTypes, HasExceptionHandling]>; let isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1 in { defm CONST_I32 : I<(outs I32:$res), (ins i32imm_op:$imm), diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td index 608963d588635e..2654a09387fd4a 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td @@ -17,8 +17,9 @@ multiclass REF_I { [(set rc:$dst, (!cast("int_wasm_ref_null_" # ht)))], "ref.null_" # ht # "$dst", "ref.null_" # ht, - !cond(!eq(ht, "func") : 0xd070, - !eq(ht, "extern") : 0xd06f)>, + !cond(!eq(ht, "func") : 0xd070, + !eq(ht, "extern") : 0xd06f, + !eq(ht, "exn") : 0xd069)>, Requires<[HasReferenceTypes]>; defm SELECT_#rc: I<(outs rc:$dst), (ins rc:$lhs, rc:$rhs, I32:$cond), (outs), (ins), @@ -37,8 +38,9 @@ multiclass REF_I { defm "" : REF_I; defm "" : REF_I; +defm "" : REF_I; -foreach rc = [FUNCREF, EXTERNREF] in { +foreach rc = [FUNCREF, EXTERNREF, EXNREF] in { def : Pat<(select (i32 (setne I32:$cond, 0)), rc:$lhs, rc:$rhs), (!cast("SELECT_"#rc) rc:$lhs, rc:$rhs, I32:$cond)>; def : Pat<(select (i32 (seteq I32:$cond, 0)), rc:$lhs, rc:$rhs), 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td index 069ce5e3bc94a9..02f0ab8577c3d0 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td @@ -64,6 +64,8 @@ multiclass TABLE { defm "" : TABLE, Requires<[HasReferenceTypes]>; defm "" : TABLE, Requires<[HasReferenceTypes]>; +defm "" : TABLE, + Requires<[HasReferenceTypes, HasExceptionHandling]>; def : Pat<(WebAssemblyTableSet mcsym:$table, i32:$idx, funcref:$r), (TABLE_SET_FUNCREF mcsym:$table, i32:$idx, funcref:$r)>, diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp index ef174e1716ef1e..d4edb6bf18d932 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -504,6 +504,8 @@ static unsigned getTeeOpcode(const TargetRegisterClass *RC) { return WebAssembly::TEE_EXTERNREF; if (RC == &WebAssembly::FUNCREFRegClass) return WebAssembly::TEE_FUNCREF; + if (RC == &WebAssembly::EXNREFRegClass) + return WebAssembly::TEE_EXNREF; llvm_unreachable("Unexpected register class"); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td index 4e2faa608be077..17889dacc868c2 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td @@ -45,6 +45,7 @@ def V128_0: WebAssemblyReg<"%v128">; def FUNCREF_0 : WebAssemblyReg<"%funcref.0">; def EXTERNREF_0 : WebAssemblyReg<"%externref.0">; +def EXNREF_0 : WebAssemblyReg<"%exnref.0">; // The value stack "register". This is an opaque entity which serves to order // uses and defs that must remain in LIFO order. 
@@ -68,3 +69,4 @@ def V128 : WebAssemblyRegClass<[v8f16, v4f32, v2f64, v2i64, v4i32, v16i8, 128, (add V128_0)>; def FUNCREF : WebAssemblyRegClass<[funcref], 0, (add FUNCREF_0)>; def EXTERNREF : WebAssemblyRegClass<[externref], 0, (add EXTERNREF_0)>; +def EXNREF : WebAssemblyRegClass<[exnref], 0, (add EXNREF_0)>; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp index 60e872549f87d9..5e7279808cce63 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp @@ -175,6 +175,8 @@ unsigned WebAssembly::getCopyOpcodeForRegClass(const TargetRegisterClass *RC) { return WebAssembly::COPY_FUNCREF; case WebAssembly::EXTERNREFRegClassID: return WebAssembly::COPY_EXTERNREF; + case WebAssembly::EXNREFRegClassID: + return WebAssembly::COPY_EXNREF; default: llvm_unreachable("Unexpected register class"); } diff --git a/llvm/test/CodeGen/WebAssembly/reg-argument.mir b/llvm/test/CodeGen/WebAssembly/reg-argument.mir index 23e66dfc71fa1b..a549990bdb0a2b 100644 --- a/llvm/test/CodeGen/WebAssembly/reg-argument.mir +++ b/llvm/test/CodeGen/WebAssembly/reg-argument.mir @@ -68,3 +68,14 @@ body: | %1:externref = ARGUMENT_externref 0, implicit $arguments RETURN implicit-def $arguments ... +--- +name: argument_exnref +# CHECK-LABEL: argument_exnref +body: | + ; CHECK-LABEL: bb.0: + ; CHECK-NEXT: %1:exnref = ARGUMENT_exnref 0 + bb.0: + %0:i32 = CONST_I32 0, implicit-def $arguments + %1:exnref = ARGUMENT_exnref 0, implicit $arguments + RETURN implicit-def $arguments +... diff --git a/llvm/test/CodeGen/WebAssembly/reg-copy.mir b/llvm/test/CodeGen/WebAssembly/reg-copy.mir index 31a5bfa63a4ea2..763fe42d07b61a 100644 --- a/llvm/test/CodeGen/WebAssembly/reg-copy.mir +++ b/llvm/test/CodeGen/WebAssembly/reg-copy.mir @@ -77,3 +77,14 @@ body: | %0:externref = COPY %1:externref RETURN implicit-def $arguments ... 
+--- +name: copy_exnref +# CHECK-LABEL: copy_exnref +body: | + ; CHECK-LABEL: bb.0: + ; CHECK-NEXT: %0:exnref = COPY_EXNREF %1:exnref + ; CHECK-NEXT: RETURN + bb.0: + %0:exnref = COPY %1:exnref + RETURN implicit-def $arguments +... diff --git a/llvm/test/MC/WebAssembly/basic-assembly.s b/llvm/test/MC/WebAssembly/basic-assembly.s index 769cd7edfa8a3e..ac358c1b5c7a52 100644 --- a/llvm/test/MC/WebAssembly/basic-assembly.s +++ b/llvm/test/MC/WebAssembly/basic-assembly.s @@ -146,12 +146,14 @@ test0: .ident "clang version 9.0.0 (trunk 364502) (llvm/trunk 364571)" -.tabletype empty_eref_table, externref -empty_eref_table: +.tabletype empty_externref_table, externref +empty_externref_table: -.tabletype empty_fref_table, funcref -empty_fref_table: +.tabletype empty_funcref_table, funcref +empty_funcref_table: +.tabletype empty_exnref_table, exnref +empty_exnref_table: # CHECK: .text # CHECK: .globaltype __stack_pointer, i32 @@ -283,8 +285,11 @@ empty_fref_table: # CHECK-NEXT: .p2align 2 # CHECK-NEXT: .int32 test0 -# CHECK: .tabletype empty_eref_table, externref -# CHECK-NEXT: empty_eref_table: +# CHECK: .tabletype empty_externref_table, externref +# CHECK-NEXT: empty_externref_table: -# CHECK: .tabletype empty_fref_table, funcref -# CHECK-NEXT: empty_fref_table: +# CHECK: .tabletype empty_funcref_table, funcref +# CHECK-NEXT: empty_funcref_table: + +# CHECK: .tabletype empty_exnref_table, exnref +# CHECK-NEXT: empty_exnref_table: diff --git a/llvm/test/MC/WebAssembly/reference-types.s b/llvm/test/MC/WebAssembly/reference-types.s index ab3e3ee6b155b1..2f8bfba68dcea1 100644 --- a/llvm/test/MC/WebAssembly/reference-types.s +++ b/llvm/test/MC/WebAssembly/reference-types.s @@ -4,22 +4,27 @@ # CHECK-LABEL:ref_is_null: # CHECK: ref.is_null # encoding: [0xd1] ref_is_null: - .functype ref_is_null () -> (i32, i32) + .functype ref_is_null () -> (i32, i32, i32) ref.null_extern ref.is_null ref.null_func ref.is_null + ref.null_exn + ref.is_null end_function # CHECK-LABEL: ref_null_test: 
# CHECK: ref.null_func # encoding: [0xd0,0x70] # CHECK: ref.null_extern # encoding: [0xd0,0x6f] +# CHECK: ref.null_exn # encoding: [0xd0,0x69] ref_null_test: .functype ref_null_test () -> () ref.null_func drop ref.null_extern drop + ref.null_exn + drop end_function # CHECK-LABEL: ref_sig_test_funcref: @@ -36,9 +41,17 @@ ref_sig_test_externref: local.get 0 end_function +# CHECK-LABEL: ref_sig_test_exnref: +# CHECK-NEXT: .functype ref_sig_test_exnref (exnref) -> (exnref) +ref_sig_test_exnref: + .functype ref_sig_test_exnref (exnref) -> (exnref) + local.get 0 + end_function + # CHECK-LABEL: ref_select_test: # CHECK: funcref.select # encoding: [0x1b] # CHECK: externref.select # encoding: [0x1b] +# CHECK: exnref.select # encoding: [0x1b] ref_select_test: .functype ref_select_test () -> () ref.null_func @@ -51,15 +64,24 @@ ref_select_test: i32.const 0 externref.select drop + ref.null_exn + ref.null_exn + i32.const 0 + exnref.select + drop end_function # CHECK-LABEL: ref_block_test: # CHECK: block funcref # CHECK: block externref +# CHECK: block exnref ref_block_test: - .functype ref_block_test () -> (externref, funcref) + .functype ref_block_test () -> (exnref, externref, funcref) block funcref block externref + block exnref + ref.null_exn + end_block ref.null_extern end_block ref.null_func diff --git a/llvm/test/MC/WebAssembly/type-checker-errors.s b/llvm/test/MC/WebAssembly/type-checker-errors.s index 5e28d117501e98..d2841250137a8c 100644 --- a/llvm/test/MC/WebAssembly/type-checker-errors.s +++ b/llvm/test/MC/WebAssembly/type-checker-errors.s @@ -215,6 +215,22 @@ table_fill_type_mismatch_3: table.fill valid_table end_function +table_fill_type_mismatch_4: + .functype table_fill_type_mismatch_4 () -> () + ref.null_exn + i32.const 1 +# CHECK: [[@LINE+1]]:3: error: popped exnref, expected externref + table.fill valid_table + end_function + +table_fill_type_mismatch_5: + .functype table_fill_type_mismatch_5 () -> () + ref.null_exn + i32.const 1 +# CHECK: [[@LINE+1]]:3: 
error: popped exnref, expected externref + table.fill valid_table + end_function + table_grow_non_exist_table: .functype table_grow_non_exist_table (externref, i32) -> (i32) local.get 0 From 4486fcba756bfa4c8729673a9533578232f0bc04 Mon Sep 17 00:00:00 2001 From: lntue <35648136+lntue@users.noreply.github.com> Date: Tue, 28 May 2024 19:14:26 -0400 Subject: [PATCH 033/230] [libc] Add proxy header for float.h. (#93504) This is the continuation of https://github.com/llvm/llvm-project/pull/88674. Fixes #88433, #90496. --------- Co-authored-by: aniplcc --- libc/hdr/CMakeLists.txt | 10 ++++++ libc/hdr/float_macros.h | 22 ++++++++++++ libc/include/llvm-libc-macros/float-macros.h | 35 ++++++++----------- .../macros/properties/CMakeLists.txt | 2 +- libc/src/__support/macros/properties/types.h | 2 +- libc/src/math/generic/CMakeLists.txt | 4 +++ libc/src/math/generic/scalbn.cpp | 11 +++--- libc/src/math/generic/scalbnf.cpp | 11 +++--- libc/src/math/generic/scalbnf128.cpp | 13 +++---- libc/src/math/generic/scalbnl.cpp | 11 +++--- .../llvm-project-overlay/libc/BUILD.bazel | 7 +++- 11 files changed, 78 insertions(+), 50 deletions(-) create mode 100644 libc/hdr/float_macros.h diff --git a/libc/hdr/CMakeLists.txt b/libc/hdr/CMakeLists.txt index 91b8cb71552a71..66b82c84dac499 100644 --- a/libc/hdr/CMakeLists.txt +++ b/libc/hdr/CMakeLists.txt @@ -87,4 +87,14 @@ add_proxy_header_library( libc.include.llvm-libc-macros.time_macros ) +add_proxy_header_library( + float_macros + HDRS + float_macros.h + DEPENDS + libc.include.llvm-libc-macros.float_macros + FULL_BUILD_DEPENDS + libc.include.float +) + add_subdirectory(types) diff --git a/libc/hdr/float_macros.h b/libc/hdr/float_macros.h new file mode 100644 index 00000000000000..a0ef5e29b98687 --- /dev/null +++ b/libc/hdr/float_macros.h @@ -0,0 +1,22 @@ +//===-- Definition of macros from math.h ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_HDR_FLOAT_MACROS_H +#define LLVM_LIBC_HDR_FLOAT_MACROS_H + +#ifdef LIBC_FULL_BUILD + +#include "include/llvm-libc-macros/float-macros.h" + +#else // Overlay mode + +#include + +#endif // LLVM_LIBC_FULL_BUILD + +#endif // LLVM_LIBC_HDR_FLOAT_MACROS_H diff --git a/libc/include/llvm-libc-macros/float-macros.h b/libc/include/llvm-libc-macros/float-macros.h index 4fe8590c5f70c8..81c1df868bf6cd 100644 --- a/libc/include/llvm-libc-macros/float-macros.h +++ b/libc/include/llvm-libc-macros/float-macros.h @@ -9,21 +9,6 @@ #ifndef LLVM_LIBC_MACROS_FLOAT_MACROS_H #define LLVM_LIBC_MACROS_FLOAT_MACROS_H -// Suppress `#include_next is a language extension` warnings. -#ifdef __clang__ -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wgnu-include-next" -#pragma clang diagnostic ignored "-Winclude-next-absolute-path" -#else // gcc -#pragma GCC system_header -#endif //__clang__ - -#include_next - -#ifdef __clang__ -#pragma clang diagnostic pop -#endif //__clang__ - #ifndef FLT_RADIX #define FLT_RADIX __FLT_RADIX__ #endif // FLT_RADIX @@ -32,9 +17,13 @@ #define FLT_EVAL_METHOD __FLT_EVAL_METHOD__ #endif // FLT_EVAL_METHOD -#ifndef DECIMAL_DIG -#define DECIMAL_DIG __DECIMAL_DIG__ -#endif // DECIMAL_DIG +#ifndef FLT_ROUNDS +#if __has_builtin(__builtin_flt_rounds) +#define FLT_ROUNDS __builtin_flt_rounds() +#else +#define FLT_ROUNDS 1 +#endif +#endif // FLT_ROUNDS #ifndef FLT_DECIMAL_DIG #define FLT_DECIMAL_DIG __FLT_DECIMAL_DIG__ @@ -48,6 +37,10 @@ #define LDBL_DECIMAL_DIG __LDBL_DECIMAL_DIG__ #endif // LDBL_DECIMAL_DIG +#ifndef DECIMAL_DIG +#define DECIMAL_DIG __DECIMAL_DIG__ +#endif // DECIMAL_DIG + #ifndef FLT_DIG #define FLT_DIG __FLT_DIG__ #endif // FLT_DIG @@ -97,15 +90,15 @@ #endif // LDBL_MAX #ifndef FLT_TRUE_MIN -#define FLT_TRUE_MIN 
__FLT_TRUE_MIN__ +#define FLT_TRUE_MIN __FLT_DENORM_MIN__ #endif // FLT_TRUE_MIN #ifndef DBL_TRUE_MIN -#define DBL_TRUE_MIN __DBL_TRUE_MIN__ +#define DBL_TRUE_MIN __DBL_DENORM_MIN__ #endif // DBL_TRUE_MIN #ifndef LDBL_TRUE_MIN -#define LDBL_TRUE_MIN __LDBL_TRUE_MIN__ +#define LDBL_TRUE_MIN __LDBL_DENORM_MIN__ #endif // LDBL_TRUE_MIN #ifndef FLT_EPSILON diff --git a/libc/src/__support/macros/properties/CMakeLists.txt b/libc/src/__support/macros/properties/CMakeLists.txt index bbc45650f3fca3..7718aeaa3de5af 100644 --- a/libc/src/__support/macros/properties/CMakeLists.txt +++ b/libc/src/__support/macros/properties/CMakeLists.txt @@ -33,6 +33,6 @@ add_header_library( .compiler .cpu_features .os - libc.include.llvm-libc-macros.float_macros + libc.hdr.float_macros libc.include.llvm-libc-types.float128 ) diff --git a/libc/src/__support/macros/properties/types.h b/libc/src/__support/macros/properties/types.h index d43cf99e6859be..781cf1b7a2b627 100644 --- a/libc/src/__support/macros/properties/types.h +++ b/libc/src/__support/macros/properties/types.h @@ -10,7 +10,7 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_TYPES_H #define LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_TYPES_H -#include "include/llvm-libc-macros/float-macros.h" // LDBL_MANT_DIG +#include "hdr/float_macros.h" // LDBL_MANT_DIG #include "include/llvm-libc-types/float128.h" // float128 #include "src/__support/macros/properties/architectures.h" #include "src/__support/macros/properties/compiler.h" diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index daaf505008ca11..269bc6be5d8343 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -2933,6 +2933,7 @@ add_entrypoint_object( HDRS ../scalbn.h DEPENDS + libc.hdr.float_macros libc.src.__support.FPUtil.manipulation_functions COMPILE_OPTIONS -O3 @@ -2945,6 +2946,7 @@ add_entrypoint_object( HDRS ../scalbnf.h DEPENDS + libc.hdr.float_macros 
libc.src.__support.FPUtil.manipulation_functions COMPILE_OPTIONS -O3 @@ -2957,6 +2959,7 @@ add_entrypoint_object( HDRS ../scalbnl.h DEPENDS + libc.hdr.float_macros libc.src.__support.FPUtil.manipulation_functions COMPILE_OPTIONS -O3 @@ -2969,6 +2972,7 @@ add_entrypoint_object( HDRS ../scalbnf128.h DEPENDS + libc.hdr.float_macros libc.src.__support.macros.properties.types libc.src.__support.FPUtil.manipulation_functions COMPILE_OPTIONS diff --git a/libc/src/math/generic/scalbn.cpp b/libc/src/math/generic/scalbn.cpp index 3908f5892f144f..207cce1550bc01 100644 --- a/libc/src/math/generic/scalbn.cpp +++ b/libc/src/math/generic/scalbn.cpp @@ -7,19 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/math/scalbn.h" +#include "hdr/float_macros.h" #include "src/__support/FPUtil/ManipulationFunctions.h" #include "src/__support/common.h" +#if FLT_RADIX != 2 +#error "FLT_RADIX != 2 is not supported." +#endif + namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(double, scalbn, (double x, int n)) { -#if !defined(__FLT_RADIX__) -#error __FLT_RADIX__ undefined. -#elif __FLT_RADIX__ != 2 -#error __FLT_RADIX__!=2, unimplemented. -#else return fputil::ldexp(x, n); -#endif } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/scalbnf.cpp b/libc/src/math/generic/scalbnf.cpp index 4a4fa86dcfd895..e478088d3ce5a5 100644 --- a/libc/src/math/generic/scalbnf.cpp +++ b/libc/src/math/generic/scalbnf.cpp @@ -7,19 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/math/scalbnf.h" +#include "hdr/float_macros.h" #include "src/__support/FPUtil/ManipulationFunctions.h" #include "src/__support/common.h" +#if FLT_RADIX != 2 +#error "FLT_RADIX != 2 is not supported." +#endif + namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(float, scalbnf, (float x, int n)) { -#if !defined(__FLT_RADIX__) -#error __FLT_RADIX__ undefined. -#elif __FLT_RADIX__ != 2 -#error __FLT_RADIX__!=2, unimplemented. 
-#else return fputil::ldexp(x, n); -#endif } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/scalbnf128.cpp b/libc/src/math/generic/scalbnf128.cpp index be3d29ed27e985..5fd59611d53de7 100644 --- a/libc/src/math/generic/scalbnf128.cpp +++ b/libc/src/math/generic/scalbnf128.cpp @@ -7,21 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/math/scalbnf128.h" +#include "hdr/float_macros.h" #include "src/__support/FPUtil/ManipulationFunctions.h" #include "src/__support/common.h" +#if FLT_RADIX != 2 +#error "FLT_RADIX != 2 is not supported." +#endif + namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(float128, scalbnf128, (float128 x, int n)) { -// TODO: should be switched to use `FLT_RADIX` in hdr/float_macros.h" instead -// see: https://github.com/llvm/llvm-project/issues/90496 -#if !defined(__FLT_RADIX__) -#error __FLT_RADIX__ undefined. -#elif __FLT_RADIX__ != 2 -#error __FLT_RADIX__!=2, unimplemented. -#else return fputil::ldexp(x, n); -#endif } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/scalbnl.cpp b/libc/src/math/generic/scalbnl.cpp index 681338ec01f078..1225a7ebaf572d 100644 --- a/libc/src/math/generic/scalbnl.cpp +++ b/libc/src/math/generic/scalbnl.cpp @@ -7,19 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/math/scalbnl.h" +#include "hdr/float_macros.h" #include "src/__support/FPUtil/ManipulationFunctions.h" #include "src/__support/common.h" +#if FLT_RADIX != 2 +#error "FLT_RADIX != 2 is not supported." +#endif + namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(long double, scalbnl, (long double x, int n)) { -#if !defined(__FLT_RADIX__) -#error __FLT_RADIX__ undefined. -#elif __FLT_RADIX__ != 2 -#error __FLT_RADIX__!=2, unimplemented. 
-#else return fputil::ldexp(x, n); -#endif } } // namespace LIBC_NAMESPACE diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 446499cf15d7b4..70ec3a48a5e2e3 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -127,6 +127,11 @@ libc_support_library( hdrs = ["hdr/time_macros.h"], ) +libc_support_library( + name = "hdr_float_macros", + hdrs = ["hdr/float_macros.h"], +) + ############################ Type Proxy Header Files ########################### libc_support_library( @@ -189,7 +194,7 @@ libc_support_library( ":__support_macros_properties_compiler", ":__support_macros_properties_cpu_features", ":__support_macros_properties_os", - ":llvm_libc_macros_float_macros", + ":hdr_float_macros", ":llvm_libc_types_float128", ], ) From 39e5036c0e22cea24df73d28746bb8fe0a117f9d Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 28 May 2024 16:25:54 -0700 Subject: [PATCH 034/230] [SCEV] Add predicated version of getSymbolicMaxBackedgeTakenCount. (#93498) This patch adds a predicated version of getSymbolicMaxBackedgeTakenCount. The intended use for this is loop access analysis for loops with uncountable exits. When analyzing dependences and computing runtime checks, we need the smallest upper bound on the number of iterations. In terms of memory safety, it shouldn't matter if any uncomputable exits leave the loop, as long as we prove that there are no dependences given the minimum of the countable exits. The same should apply also for generating runtime checks. 
PR: https://github.com/llvm/llvm-project/pull/93498 --- llvm/include/llvm/Analysis/ScalarEvolution.h | 19 +++++++- llvm/lib/Analysis/ScalarEvolution.cpp | 48 +++++++++++++++++-- ...cated-symbolic-max-backedge-taken-count.ll | 6 +++ 3 files changed, 67 insertions(+), 6 deletions(-) diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h index 1d016b28347d27..72f3d945424963 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -912,6 +912,13 @@ class ScalarEvolution { return getBackedgeTakenCount(L, SymbolicMaximum); } + /// Similar to getSymbolicMaxBackedgeTakenCount, except it will add a set of + /// SCEV predicates to Predicates that are required to be true in order for + /// the answer to be correct. Predicates can be checked with run-time + /// checks and can be used to perform loop versioning. + const SCEV *getPredicatedSymbolicMaxBackedgeTakenCount( + const Loop *L, SmallVector &Predicates); + /// Return true if the backedge taken count is either the value returned by /// getConstantMaxBackedgeTakenCount or zero. bool isBackedgeTakenCountMaxOrZero(const Loop *L); @@ -1549,7 +1556,9 @@ class ScalarEvolution { ScalarEvolution *SE) const; /// Get the symbolic max backedge taken count for the loop. - const SCEV *getSymbolicMax(const Loop *L, ScalarEvolution *SE); + const SCEV * + getSymbolicMax(const Loop *L, ScalarEvolution *SE, + SmallVector *Predicates = nullptr); /// Get the symbolic max backedge taken count for the particular loop exit. const SCEV *getSymbolicMax(const BasicBlock *ExitingBlock, @@ -1746,7 +1755,7 @@ class ScalarEvolution { /// Similar to getBackedgeTakenInfo, but will add predicates as required /// with the purpose of returning complete information. 
- const BackedgeTakenInfo &getPredicatedBackedgeTakenInfo(const Loop *L); + BackedgeTakenInfo &getPredicatedBackedgeTakenInfo(const Loop *L); /// Compute the number of times the specified loop will iterate. /// If AllowPredicates is set, we will create new SCEV predicates as @@ -2311,6 +2320,9 @@ class PredicatedScalarEvolution { /// Get the (predicated) backedge count for the analyzed loop. const SCEV *getBackedgeTakenCount(); + /// Get the (predicated) symbolic max backedge count for the analyzed loop. + const SCEV *getSymbolicMaxBackedgeTakenCount(); + /// Adds a new predicate. void addPredicate(const SCEVPredicate &Pred); @@ -2379,6 +2391,9 @@ class PredicatedScalarEvolution { /// The backedge taken count. const SCEV *BackedgeCount = nullptr; + + /// The symbolic backedge taken count. + const SCEV *SymbolicMaxBackedgeCount = nullptr; }; template <> struct DenseMapInfo { diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index bb56b41fe15d58..e46d7183a2a359 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -8295,6 +8295,11 @@ const SCEV *ScalarEvolution::getBackedgeTakenCount(const Loop *L, llvm_unreachable("Invalid ExitCountKind!"); } +const SCEV *ScalarEvolution::getPredicatedSymbolicMaxBackedgeTakenCount( + const Loop *L, SmallVector &Preds) { + return getPredicatedBackedgeTakenInfo(L).getSymbolicMax(L, this, &Preds); +} + bool ScalarEvolution::isBackedgeTakenCountMaxOrZero(const Loop *L) { return getBackedgeTakenInfo(L).isConstantMaxOrZero(this); } @@ -8311,7 +8316,7 @@ static void PushLoopPHIs(const Loop *L, Worklist.push_back(&PN); } -const ScalarEvolution::BackedgeTakenInfo & +ScalarEvolution::BackedgeTakenInfo & ScalarEvolution::getPredicatedBackedgeTakenInfo(const Loop *L) { auto &BTI = getBackedgeTakenInfo(L); if (BTI.hasFullInfo()) @@ -8644,9 +8649,9 @@ ScalarEvolution::BackedgeTakenInfo::getConstantMax(ScalarEvolution *SE) const { return getConstantMax(); } -const 
SCEV * -ScalarEvolution::BackedgeTakenInfo::getSymbolicMax(const Loop *L, - ScalarEvolution *SE) { +const SCEV *ScalarEvolution::BackedgeTakenInfo::getSymbolicMax( + const Loop *L, ScalarEvolution *SE, + SmallVector *Predicates) { if (!SymbolicMax) { // Form an expression for the maximum exit count possible for this loop. We // merge the max and exact information to approximate a version of @@ -8661,6 +8666,12 @@ ScalarEvolution::BackedgeTakenInfo::getSymbolicMax(const Loop *L, "We should only have known counts for exiting blocks that " "dominate latch!"); ExitCounts.push_back(ExitCount); + if (Predicates) + for (const auto *P : ENT.Predicates) + Predicates->push_back(P); + + assert((Predicates || ENT.hasAlwaysTruePredicate()) && + "Predicate should be always true!"); } } if (ExitCounts.empty()) @@ -13609,6 +13620,24 @@ static void PrintLoopInfo(raw_ostream &OS, ScalarEvolution *SE, P->print(OS, 4); } + Preds.clear(); + auto *PredSymbolicMax = + SE->getPredicatedSymbolicMaxBackedgeTakenCount(L, Preds); + if (SymbolicBTC != PredSymbolicMax) { + OS << "Loop "; + L->getHeader()->printAsOperand(OS, /*PrintType=*/false); + OS << ": "; + if (!isa(PredSymbolicMax)) { + OS << "Predicated symbolic max backedge-taken count is "; + PrintSCEVWithTypeHint(OS, PredSymbolicMax); + } else + OS << "Unpredictable predicated symbolic max backedge-taken count."; + OS << "\n"; + OS << " Predicates:\n"; + for (const auto *P : Preds) + P->print(OS, 4); + } + if (SE->hasLoopInvariantBackedgeTakenCount(L)) { OS << "Loop "; L->getHeader()->printAsOperand(OS, /*PrintType=*/false); @@ -14822,6 +14851,17 @@ const SCEV *PredicatedScalarEvolution::getBackedgeTakenCount() { return BackedgeCount; } +const SCEV *PredicatedScalarEvolution::getSymbolicMaxBackedgeTakenCount() { + if (!SymbolicMaxBackedgeCount) { + SmallVector Preds; + SymbolicMaxBackedgeCount = + SE.getPredicatedSymbolicMaxBackedgeTakenCount(&L, Preds); + for (const auto *P : Preds) + addPredicate(*P); + } + return 
SymbolicMaxBackedgeCount; +} + void PredicatedScalarEvolution::addPredicate(const SCEVPredicate &Pred) { if (Preds->implies(&Pred)) return; diff --git a/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll b/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll index d40416359b65c6..8dc79a54eb97a5 100644 --- a/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll +++ b/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll @@ -12,6 +12,9 @@ define void @test1(i64 %x, ptr %a, ptr %b) { ; CHECK-NEXT: Loop %header: Unpredictable symbolic max backedge-taken count. ; CHECK-NEXT: symbolic max exit count for header: ***COULDNOTCOMPUTE*** ; CHECK-NEXT: symbolic max exit count for latch: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: Loop %header: Predicated symbolic max backedge-taken count is (-1 + (1 umax %x)) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {1,+,1}<%header> Added Flags: ; entry: br label %header @@ -52,6 +55,9 @@ define void @test2(i64 %x, ptr %a) { ; CHECK-NEXT: Loop %header: Unpredictable symbolic max backedge-taken count. ; CHECK-NEXT: symbolic max exit count for header: ***COULDNOTCOMPUTE*** ; CHECK-NEXT: symbolic max exit count for latch: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: Loop %header: Predicated symbolic max backedge-taken count is (-1 + (1 umax %x)) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {1,+,1}<%header> Added Flags: ; entry: br label %header From 722a5fce589cea76a0baf89ce731477bae8cf4b8 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 28 May 2024 16:27:04 -0700 Subject: [PATCH 035/230] [WebAssembly] Add -wasm-enable-exnref option (#93597) This adds `-wasm-enable-exnref`, which will enable the new EH instructions using `exnref` (adopted in Oct 2023 CG meeting): https://github.com/WebAssembly/exception-handling/blob/main/proposals/exception-handling/Exceptions.md This option should be used with `-wasm-enable-eh`. 
--- .../WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp | 7 +++++++ .../WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h | 1 + llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp | 4 ++++ llvm/test/CodeGen/WebAssembly/eh-option-errors.ll | 3 +++ 4 files changed, 15 insertions(+) diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp index e8f58a19d25e3b..71dfe1062956e3 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp @@ -54,6 +54,13 @@ cl::opt // setjmp/longjmp handling using wasm EH instrutions cl::opt WebAssembly::WasmEnableSjLj( "wasm-enable-sjlj", cl::desc("WebAssembly setjmp/longjmp handling")); +// Whether we use the new exnref Wasm EH proposal adopted on Oct 2023. +// Should be used with -wasm-enable-eh. +// Currently set to false by default, but will later change to true and then +// later can be removed after the legacy WAsm EH instructions are removed. 
+cl::opt WebAssembly::WasmEnableExnref( + "wasm-enable-exnref", cl::desc("WebAssembly exception handling (exnref)"), + cl::init(false)); static MCAsmInfo *createMCAsmInfo(const MCRegisterInfo & /*MRI*/, const Triple &TT, diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index b7498cb4299452..7f1a5f616ed484 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -44,6 +44,7 @@ extern cl::opt WasmEnableEmEH; // asm.js-style EH extern cl::opt WasmEnableEmSjLj; // asm.js-style SjLJ extern cl::opt WasmEnableEH; // EH using Wasm EH instructions extern cl::opt WasmEnableSjLj; // SjLj using Wasm EH instructions +extern cl::opt WasmEnableExnref; // EH using new Wasm EH (exnref) enum OperandType { /// Basic block label in a branch construct. diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 68126992ddcd72..fd92a35c2638a5 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -385,6 +385,7 @@ FunctionPass *WebAssemblyPassConfig::createTargetRegisterAllocator(bool) { using WebAssembly::WasmEnableEH; using WebAssembly::WasmEnableEmEH; using WebAssembly::WasmEnableEmSjLj; +using WebAssembly::WasmEnableExnref; using WebAssembly::WasmEnableSjLj; static void basicCheckForEHAndSjLj(TargetMachine *TM) { @@ -401,6 +402,9 @@ static void basicCheckForEHAndSjLj(TargetMachine *TM) { if (WasmEnableEmEH && WasmEnableSjLj) report_fatal_error( "-enable-emscripten-cxx-exceptions not allowed with -wasm-enable-sjlj"); + if (WasmEnableExnref && !WasmEnableEH) + report_fatal_error( + "-wasm-enable-exnref should be used with -wasm-enable-eh"); // Here we make sure TargetOptions.ExceptionModel is the same as // MCAsmInfo.ExceptionsType. 
Normally these have to be the same, because clang diff --git a/llvm/test/CodeGen/WebAssembly/eh-option-errors.ll b/llvm/test/CodeGen/WebAssembly/eh-option-errors.ll index 74d02ddc405d3f..52a6364e122589 100644 --- a/llvm/test/CodeGen/WebAssembly/eh-option-errors.ll +++ b/llvm/test/CodeGen/WebAssembly/eh-option-errors.ll @@ -9,6 +9,9 @@ target triple = "wasm32-unknown-unknown" ; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -wasm-enable-sjlj 2>&1 | FileCheck %s --check-prefix=EM_EH_W_WASM_SJLJ ; EM_EH_W_WASM_SJLJ: LLVM ERROR: -enable-emscripten-cxx-exceptions not allowed with -wasm-enable-sjlj +; RUN: not --crash llc < %s -wasm-enable-exnref 2>&1 | FileCheck %s --check-prefix=WASM_EXNREF_ONLY +; WASM_EXNREF_ONLY: LLVM ERROR: -wasm-enable-exnref should be used with -wasm-enable-eh + ; RUN: not --crash llc < %s -wasm-enable-eh -exception-model=dwarf 2>&1 | FileCheck %s --check-prefix=EH_MODEL_DWARF ; EH_MODEL_DWARF: LLVM ERROR: -exception-model should be either 'none' or 'wasm' From 60bce6eab4d734b86f49b7638856eb8899bc89e8 Mon Sep 17 00:00:00 2001 From: Brendan Dahl Date: Tue, 28 May 2024 16:33:20 -0700 Subject: [PATCH 036/230] [WebAssembly] Implement all f16x8 binary instructions. (#93360) This reuses most of the code that was created for f32x4 and f64x2 binary instructions and tries to follow how they were implemented. 
add/sub/mul/div - use regular LL instructions min/max - use the minimum/maximum intrinsic, and also have builtins pmin/pmax - use the wasm.pmax/pmin intrinsics and also have builtins Specified at: https://github.com/WebAssembly/half-precision/blob/29a9b9462c9285d4ccc1a5dc39214ddfd1892658/proposals/half-precision/Overview.md --- .../clang/Basic/BuiltinsWebAssembly.def | 4 ++ clang/lib/CodeGen/CGBuiltin.cpp | 4 ++ clang/test/CodeGen/builtins-wasm.c | 24 +++++++ .../WebAssembly/WebAssemblyISelLowering.cpp | 5 ++ .../WebAssembly/WebAssemblyInstrSIMD.td | 43 +++++++++--- .../CodeGen/WebAssembly/half-precision.ll | 68 +++++++++++++++++++ llvm/test/MC/WebAssembly/simd-encodings.s | 24 +++++++ 7 files changed, 163 insertions(+), 9 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def index fd8c1b480d6da0..4e48ff48b60f5f 100644 --- a/clang/include/clang/Basic/BuiltinsWebAssembly.def +++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def @@ -135,6 +135,10 @@ TARGET_BUILTIN(__builtin_wasm_min_f64x2, "V2dV2dV2d", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_max_f64x2, "V2dV2dV2d", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_pmin_f64x2, "V2dV2dV2d", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_pmax_f64x2, "V2dV2dV2d", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_min_f16x8, "V8hV8hV8h", "nc", "half-precision") +TARGET_BUILTIN(__builtin_wasm_max_f16x8, "V8hV8hV8h", "nc", "half-precision") +TARGET_BUILTIN(__builtin_wasm_pmin_f16x8, "V8hV8hV8h", "nc", "half-precision") +TARGET_BUILTIN(__builtin_wasm_pmax_f16x8, "V8hV8hV8h", "nc", "half-precision") TARGET_BUILTIN(__builtin_wasm_ceil_f32x4, "V4fV4f", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_floor_f32x4, "V4fV4f", "nc", "simd128") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 5edf8c79709131..a3c65105033247 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -20806,6 
+20806,7 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, } case WebAssembly::BI__builtin_wasm_min_f32: case WebAssembly::BI__builtin_wasm_min_f64: + case WebAssembly::BI__builtin_wasm_min_f16x8: case WebAssembly::BI__builtin_wasm_min_f32x4: case WebAssembly::BI__builtin_wasm_min_f64x2: { Value *LHS = EmitScalarExpr(E->getArg(0)); @@ -20816,6 +20817,7 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, } case WebAssembly::BI__builtin_wasm_max_f32: case WebAssembly::BI__builtin_wasm_max_f64: + case WebAssembly::BI__builtin_wasm_max_f16x8: case WebAssembly::BI__builtin_wasm_max_f32x4: case WebAssembly::BI__builtin_wasm_max_f64x2: { Value *LHS = EmitScalarExpr(E->getArg(0)); @@ -20824,6 +20826,7 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, CGM.getIntrinsic(Intrinsic::maximum, ConvertType(E->getType())); return Builder.CreateCall(Callee, {LHS, RHS}); } + case WebAssembly::BI__builtin_wasm_pmin_f16x8: case WebAssembly::BI__builtin_wasm_pmin_f32x4: case WebAssembly::BI__builtin_wasm_pmin_f64x2: { Value *LHS = EmitScalarExpr(E->getArg(0)); @@ -20832,6 +20835,7 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, CGM.getIntrinsic(Intrinsic::wasm_pmin, ConvertType(E->getType())); return Builder.CreateCall(Callee, {LHS, RHS}); } + case WebAssembly::BI__builtin_wasm_pmax_f16x8: case WebAssembly::BI__builtin_wasm_pmax_f32x4: case WebAssembly::BI__builtin_wasm_pmax_f64x2: { Value *LHS = EmitScalarExpr(E->getArg(0)); diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c index 93a6ab06081c99..d6ee4f68700dca 100644 --- a/clang/test/CodeGen/builtins-wasm.c +++ b/clang/test/CodeGen/builtins-wasm.c @@ -825,6 +825,30 @@ float extract_lane_f16x8(f16x8 a, int i) { // WEBASSEMBLY-NEXT: ret float %0 return __builtin_wasm_extract_lane_f16x8(a, i); } + +f16x8 min_f16x8(f16x8 a, f16x8 b) { + // WEBASSEMBLY: %0 = tail call <8 x half> @llvm.minimum.v8f16(<8 x half> 
%a, <8 x half> %b) + // WEBASSEMBLY-NEXT: ret <8 x half> %0 + return __builtin_wasm_min_f16x8(a, b); +} + +f16x8 max_f16x8(f16x8 a, f16x8 b) { + // WEBASSEMBLY: %0 = tail call <8 x half> @llvm.maximum.v8f16(<8 x half> %a, <8 x half> %b) + // WEBASSEMBLY-NEXT: ret <8 x half> %0 + return __builtin_wasm_max_f16x8(a, b); +} + +f16x8 pmin_f16x8(f16x8 a, f16x8 b) { + // WEBASSEMBLY: %0 = tail call <8 x half> @llvm.wasm.pmin.v8f16(<8 x half> %a, <8 x half> %b) + // WEBASSEMBLY-NEXT: ret <8 x half> %0 + return __builtin_wasm_pmin_f16x8(a, b); +} + +f16x8 pmax_f16x8(f16x8 a, f16x8 b) { + // WEBASSEMBLY: %0 = tail call <8 x half> @llvm.wasm.pmax.v8f16(<8 x half> %a, <8 x half> %b) + // WEBASSEMBLY-NEXT: ret <8 x half> %0 + return __builtin_wasm_pmax_f16x8(a, b); +} __externref_t externref_null() { return __builtin_wasm_ref_null_extern(); // WEBASSEMBLY: tail call ptr addrspace(10) @llvm.wasm.ref.null.extern() diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index f9f16498bb390c..4beab9d091581b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -145,6 +145,11 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( setTruncStoreAction(T, MVT::f16, Expand); } + if (Subtarget->hasHalfPrecision()) { + setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal); + setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal); + } + // Expand unavailable integer operations. 
for (auto Op : {ISD::BSWAP, ISD::SMUL_LOHI, ISD::UMUL_LOHI, ISD::MULHS, ISD::MULHU, diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 558e3d859dcd84..baf15ccdbe9edb 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -16,33 +16,34 @@ multiclass ABSTRACT_SIMD_I pattern_r, string asmstr_r, string asmstr_s, bits<32> simdop, - Predicate simd_level> { + list reqs> { defm "" : I, - Requires<[simd_level]>; + Requires; } multiclass SIMD_I pattern_r, string asmstr_r = "", - string asmstr_s = "", bits<32> simdop = -1> { + string asmstr_s = "", bits<32> simdop = -1, + list reqs = []> { defm "" : ABSTRACT_SIMD_I; + asmstr_s, simdop, !listconcat([HasSIMD128], reqs)>; } multiclass RELAXED_I pattern_r, string asmstr_r = "", string asmstr_s = "", bits<32> simdop = -1> { defm "" : ABSTRACT_SIMD_I; + asmstr_s, simdop, [HasRelaxedSIMD]>; } multiclass HALF_PRECISION_I pattern_r, string asmstr_r = "", string asmstr_s = "", bits<32> simdop = -1> { defm "" : ABSTRACT_SIMD_I; + asmstr_s, simdop, [HasHalfPrecision]>; } @@ -152,6 +153,19 @@ def F64x2 : Vec { let prefix = "f64x2"; } +def F16x8 : Vec { + let vt = v8f16; + let int_vt = v8i16; + let lane_vt = f32; + let lane_rc = F32; + let lane_bits = 16; + let lane_idx = LaneIdx8; + let lane_load = int_wasm_loadf16_f32; + let splat = PatFrag<(ops node:$x), (v8f16 (splat_vector (f16 $x)))>; + let prefix = "f16x8"; +} + +// TODO: Include F16x8 here when half precision is better supported. 
defvar AllVecs = [I8x16, I16x8, I32x4, I64x2, F32x4, F64x2]; defvar IntVecs = [I8x16, I16x8, I32x4, I64x2]; @@ -781,13 +795,19 @@ def : Pat<(v2i64 (nodes[0] (v2f64 V128:$lhs), (v2f64 V128:$rhs))), // Bitwise operations //===----------------------------------------------------------------------===// -multiclass SIMDBinary simdop> { +multiclass SIMDBinary simdop, list reqs = []> { defm _#vec : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs), (outs), (ins), [(set (vec.vt V128:$dst), (node (vec.vt V128:$lhs), (vec.vt V128:$rhs)))], vec.prefix#"."#name#"\t$dst, $lhs, $rhs", - vec.prefix#"."#name, simdop>; + vec.prefix#"."#name, simdop, reqs>; +} + +multiclass HalfPrecisionBinary simdop> { + defm "" : SIMDBinary; } multiclass SIMDBitwise simdop, @@ -1199,6 +1219,7 @@ def : Pat<(v2f64 (froundeven (v2f64 V128:$src))), (NEAREST_F64x2 V128:$src)>; multiclass SIMDBinaryFP baseInst> { defm "" : SIMDBinary; defm "" : SIMDBinary; + defm "" : HalfPrecisionBinary; } // Addition: add @@ -1242,7 +1263,7 @@ defm PMAX : SIMDBinaryFP; // Also match the pmin/pmax cases where the operands are int vectors (but the // comparison is still a floating point comparison). This can happen when using // the wasm_simd128.h intrinsics because v128_t is an integer vector. 
-foreach vec = [F32x4, F64x2] in { +foreach vec = [F32x4, F64x2, F16x8] in { defvar pmin = !cast("PMIN_"#vec); defvar pmax = !cast("PMAX_"#vec); def : Pat<(vec.int_vt (vselect @@ -1266,6 +1287,10 @@ def : Pat<(v2f64 (int_wasm_pmin (v2f64 V128:$lhs), (v2f64 V128:$rhs))), (PMIN_F64x2 V128:$lhs, V128:$rhs)>; def : Pat<(v2f64 (int_wasm_pmax (v2f64 V128:$lhs), (v2f64 V128:$rhs))), (PMAX_F64x2 V128:$lhs, V128:$rhs)>; +def : Pat<(v8f16 (int_wasm_pmin (v8f16 V128:$lhs), (v8f16 V128:$rhs))), + (PMIN_F16x8 V128:$lhs, V128:$rhs)>; +def : Pat<(v8f16 (int_wasm_pmax (v8f16 V128:$lhs), (v8f16 V128:$rhs))), + (PMAX_F16x8 V128:$lhs, V128:$rhs)>; //===----------------------------------------------------------------------===// // Conversions diff --git a/llvm/test/CodeGen/WebAssembly/half-precision.ll b/llvm/test/CodeGen/WebAssembly/half-precision.ll index d9d3f6be800fdd..73ccea8d652db8 100644 --- a/llvm/test/CodeGen/WebAssembly/half-precision.ll +++ b/llvm/test/CodeGen/WebAssembly/half-precision.ll @@ -35,3 +35,71 @@ define float @extract_lane_v8f16(<8 x half> %v) { %r = call float @llvm.wasm.extract.lane.f16x8(<8 x half> %v, i32 1) ret float %r } + +; CHECK-LABEL: add_v8f16: +; CHECK: f16x8.add $push0=, $0, $1 +; CHECK-NEXT: return $pop0 +define <8 x half> @add_v8f16(<8 x half> %a, <8 x half> %b) { + %r = fadd <8 x half> %a, %b + ret <8 x half> %r +} + +; CHECK-LABEL: sub_v8f16: +; CHECK: f16x8.sub $push0=, $0, $1 +; CHECK-NEXT: return $pop0 +define <8 x half> @sub_v8f16(<8 x half> %a, <8 x half> %b) { + %r = fsub <8 x half> %a, %b + ret <8 x half> %r +} + +; CHECK-LABEL: mul_v8f16: +; CHECK: f16x8.mul $push0=, $0, $1 +; CHECK-NEXT: return $pop0 +define <8 x half> @mul_v8f16(<8 x half> %a, <8 x half> %b) { + %r = fmul <8 x half> %a, %b + ret <8 x half> %r +} + +; CHECK-LABEL: div_v8f16: +; CHECK: f16x8.div $push0=, $0, $1 +; CHECK-NEXT: return $pop0 +define <8 x half> @div_v8f16(<8 x half> %a, <8 x half> %b) { + %r = fdiv <8 x half> %a, %b + ret <8 x half> %r +} + +; CHECK-LABEL: 
min_intrinsic_v8f16: +; CHECK: f16x8.min $push0=, $0, $1 +; CHECK-NEXT: return $pop0 +declare <8 x half> @llvm.minimum.v8f16(<8 x half>, <8 x half>) +define <8 x half> @min_intrinsic_v8f16(<8 x half> %x, <8 x half> %y) { + %a = call <8 x half> @llvm.minimum.v8f16(<8 x half> %x, <8 x half> %y) + ret <8 x half> %a +} + +; CHECK-LABEL: max_intrinsic_v8f16: +; CHECK: f16x8.max $push0=, $0, $1 +; CHECK-NEXT: return $pop0 +declare <8 x half> @llvm.maximum.v8f16(<8 x half>, <8 x half>) +define <8 x half> @max_intrinsic_v8f16(<8 x half> %x, <8 x half> %y) { + %a = call <8 x half> @llvm.maximum.v8f16(<8 x half> %x, <8 x half> %y) + ret <8 x half> %a +} + +; CHECK-LABEL: pmin_intrinsic_v8f16: +; CHECK: f16x8.pmin $push0=, $0, $1 +; CHECK-NEXT: return $pop0 +declare <8 x half> @llvm.wasm.pmin.v8f16(<8 x half>, <8 x half>) +define <8 x half> @pmin_intrinsic_v8f16(<8 x half> %a, <8 x half> %b) { + %v = call <8 x half> @llvm.wasm.pmin.v8f16(<8 x half> %a, <8 x half> %b) + ret <8 x half> %v +} + +; CHECK-LABEL: pmax_intrinsic_v8f16: +; CHECK: f16x8.pmax $push0=, $0, $1 +; CHECK-NEXT: return $pop0 +declare <8 x half> @llvm.wasm.pmax.v8f16(<8 x half>, <8 x half>) +define <8 x half> @pmax_intrinsic_v8f16(<8 x half> %a, <8 x half> %b) { + %v = call <8 x half> @llvm.wasm.pmax.v8f16(<8 x half> %a, <8 x half> %b) + ret <8 x half> %v +} diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s index d397188a9882ea..113a23da776fa9 100644 --- a/llvm/test/MC/WebAssembly/simd-encodings.s +++ b/llvm/test/MC/WebAssembly/simd-encodings.s @@ -851,4 +851,28 @@ main: # CHECK: f16x8.extract_lane 1 # encoding: [0xfd,0xa1,0x02,0x01] f16x8.extract_lane 1 + # CHECK: f16x8.add # encoding: [0xfd,0xb4,0x02] + f16x8.add + + # CHECK: f16x8.sub # encoding: [0xfd,0xb5,0x02] + f16x8.sub + + # CHECK: f16x8.mul # encoding: [0xfd,0xb6,0x02] + f16x8.mul + + # CHECK: f16x8.div # encoding: [0xfd,0xb7,0x02] + f16x8.div + + # CHECK: f16x8.min # encoding: [0xfd,0xb8,0x02] + 
f16x8.min + + # CHECK: f16x8.max # encoding: [0xfd,0xb9,0x02] + f16x8.max + + # CHECK: f16x8.pmin # encoding: [0xfd,0xba,0x02] + f16x8.pmin + + # CHECK: f16x8.pmax # encoding: [0xfd,0xbb,0x02] + f16x8.pmax + end_function From 0edc97f119f3ac3ff96b11183fe5c001a48a9a8d Mon Sep 17 00:00:00 2001 From: Ahmed Bougacha Date: Tue, 28 May 2024 16:39:09 -0700 Subject: [PATCH 037/230] [IR][AArch64][PAC] Add "ptrauth(...)" Constant to represent signed pointers. (#85738) This defines a new kind of IR Constant that represents a ptrauth signed pointer, as used in AArch64 PAuth. It allows representing most kinds of signed pointer constants used thus far in the llvm ptrauth implementations, notably those used in the Darwin and ELF ABIs being implemented for c/c++. These signed pointer constants are then lowered to ELF/MachO relocations. These can be simply thought of as a constant `llvm.ptrauth.sign`, with the interesting addition of discriminator computation: the `ptrauth` constant can also represent a combined blend, when both address and integer discriminator operands are used. Both operands are otherwise optional, with default values 0/null. 
--- llvm/docs/LangRef.rst | 34 +++++ llvm/docs/PointerAuth.md | 22 ++++ llvm/include/llvm/AsmParser/LLToken.h | 1 + llvm/include/llvm/Bitcode/LLVMBitCodes.h | 1 + llvm/include/llvm/IR/Constants.h | 66 ++++++++++ llvm/include/llvm/IR/Value.def | 1 + llvm/lib/Analysis/ValueTracking.cpp | 4 + llvm/lib/AsmParser/LLLexer.cpp | 1 + llvm/lib/AsmParser/LLParser.cpp | 54 ++++++++ llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp | 1 + llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 25 +++- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 6 + llvm/lib/IR/AsmWriter.cpp | 21 +++ llvm/lib/IR/Constants.cpp | 121 ++++++++++++++++++ llvm/lib/IR/ConstantsContext.h | 47 +++++++ llvm/lib/IR/LLVMContextImpl.h | 2 + llvm/lib/IR/Verifier.cpp | 23 ++++ llvm/test/Assembler/invalid-ptrauth-const1.ll | 6 + llvm/test/Assembler/invalid-ptrauth-const2.ll | 6 + llvm/test/Assembler/invalid-ptrauth-const3.ll | 6 + llvm/test/Assembler/invalid-ptrauth-const4.ll | 6 + llvm/test/Assembler/invalid-ptrauth-const5.ll | 6 + llvm/test/Assembler/ptrauth-const.ll | 24 ++++ llvm/test/Bitcode/compatibility.ll | 4 + llvm/utils/vim/syntax/llvm.vim | 1 + 25 files changed, 488 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Assembler/invalid-ptrauth-const1.ll create mode 100644 llvm/test/Assembler/invalid-ptrauth-const2.ll create mode 100644 llvm/test/Assembler/invalid-ptrauth-const3.ll create mode 100644 llvm/test/Assembler/invalid-ptrauth-const4.ll create mode 100644 llvm/test/Assembler/invalid-ptrauth-const5.ll create mode 100644 llvm/test/Assembler/ptrauth-const.ll diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 614dd98b013b35..7b64c477d13c7f 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -4754,6 +4754,40 @@ reference to the CFI jump table in the ``LowerTypeTests`` pass. These constants may be useful in low-level programs, such as operating system kernels, which need to refer to the actual function body. +.. 
_ptrauth_constant: + +Pointer Authentication Constants +-------------------------------- + +``ptrauth (ptr CST, i32 KEY[, i64 DISC[, ptr ADDRDISC]?]?)`` + +A '``ptrauth``' constant represents a pointer with a cryptographic +authentication signature embedded into some bits, as described in the +`Pointer Authentication `__ document. + +A '``ptrauth``' constant is simply a constant equivalent to the +``llvm.ptrauth.sign`` intrinsic, potentially fed by a discriminator +``llvm.ptrauth.blend`` if needed. + +Its type is the same as the first argument. An integer constant discriminator +and an address discriminator may be optionally specified. Otherwise, they have +values ``i64 0`` and ``ptr null``. + +If the address discriminator is ``null`` then the expression is equivalent to + +.. code-block:: llvm + + %tmp = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr CST to i64), i32 KEY, i64 DISC) + %val = inttoptr i64 %tmp to ptr + +Otherwise, the expression is equivalent to: + +.. code-block:: llvm + + %tmp1 = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr ADDRDISC to i64), i64 DISC) + %tmp2 = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr CST to i64), i32 KEY, i64 %tmp1) + %val = inttoptr i64 %tmp2 to ptr + .. _constantexprs: Constant Expressions diff --git a/llvm/docs/PointerAuth.md b/llvm/docs/PointerAuth.md index a8d2b4d8f5f0bd..cf2cc6305f130f 100644 --- a/llvm/docs/PointerAuth.md +++ b/llvm/docs/PointerAuth.md @@ -16,6 +16,7 @@ For more details, see the clang documentation page for At the IR level, it is represented using: * a [set of intrinsics](#intrinsics) (to sign/authenticate pointers) +* a [signed pointer constant](#constant) (to sign globals) * a [call operand bundle](#operand-bundle) (to authenticate called pointers) The current implementation leverages the @@ -225,6 +226,27 @@ with a pointer address discriminator, in a way that is specified by the target implementation. 
+### Constant + +[Intrinsics](#intrinsics) can be used to produce signed pointers dynamically, +in code, but not for signed pointers referenced by constants, in, e.g., global +initializers. + +The latter are represented using a +[``ptrauth`` constant](https://llvm.org/docs/LangRef.html#ptrauth-constant), +which describes an authenticated relocation producing a signed pointer. + +```llvm +ptrauth (ptr CST, i32 KEY, i64 DISC, ptr ADDRDISC) +``` + +is equivalent to: + +```llvm + %disc = call i64 @llvm.ptrauth.blend(i64 ptrtoint(ptr ADDRDISC to i64), i64 DISC) + %signedval = call i64 @llvm.ptrauth.sign(ptr CST, i32 KEY, i64 %disc) +``` + ### Operand Bundle Function pointers used as indirect call targets can be signed when materialized, diff --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h index df61ec6ed30e0b..69821c22dcd619 100644 --- a/llvm/include/llvm/AsmParser/LLToken.h +++ b/llvm/include/llvm/AsmParser/LLToken.h @@ -346,6 +346,7 @@ enum Kind { kw_blockaddress, kw_dso_local_equivalent, kw_no_cfi, + kw_ptrauth, kw_freeze, diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index d3b9e96520f88a..9999aee61528e5 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -413,6 +413,7 @@ enum ConstantsCodes { // asmstr,conststr] CST_CODE_CE_GEP_WITH_INRANGE = 31, // [opty, flags, range, n x operands] CST_CODE_CE_GEP = 32, // [opty, flags, n x operands] + CST_CODE_PTRAUTH = 33, // [ptr, key, disc, addrdisc] }; /// CastOpcodes - These are values used in the bitcode files to encode which diff --git a/llvm/include/llvm/IR/Constants.h b/llvm/include/llvm/IR/Constants.h index a1e5005a9d1da5..86f6be7985a23f 100644 --- a/llvm/include/llvm/IR/Constants.h +++ b/llvm/include/llvm/IR/Constants.h @@ -1008,6 +1008,72 @@ struct OperandTraits : public FixedNumOperandTraits { DEFINE_TRANSPARENT_OPERAND_ACCESSORS(NoCFIValue, Value) +/// A signed pointer, in the 
ptrauth sense. +class ConstantPtrAuth final : public Constant { + friend struct ConstantPtrAuthKeyType; + friend class Constant; + + ConstantPtrAuth(Constant *Ptr, ConstantInt *Key, ConstantInt *Disc, + Constant *AddrDisc); + + void *operator new(size_t s) { return User::operator new(s, 4); } + + void destroyConstantImpl(); + Value *handleOperandChangeImpl(Value *From, Value *To); + +public: + /// Return a pointer signed with the specified parameters. + static ConstantPtrAuth *get(Constant *Ptr, ConstantInt *Key, + ConstantInt *Disc, Constant *AddrDisc); + + /// Produce a new ptrauth expression signing the given value using + /// the same schema as is stored in one. + ConstantPtrAuth *getWithSameSchema(Constant *Pointer) const; + + /// Transparently provide more efficient getOperand methods. + DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Constant); + + /// The pointer that is signed in this ptrauth signed pointer. + Constant *getPointer() const { return cast(Op<0>().get()); } + + /// The Key ID, an i32 constant. + ConstantInt *getKey() const { return cast(Op<1>().get()); } + + /// The integer discriminator, an i64 constant, or 0. + ConstantInt *getDiscriminator() const { + return cast(Op<2>().get()); + } + + /// The address discriminator if any, or the null constant. + /// If present, this must be a value equivalent to the storage location of + /// the only global-initializer user of the ptrauth signed pointer. + Constant *getAddrDiscriminator() const { + return cast(Op<3>().get()); + } + + /// Whether there is any non-null address discriminator. + bool hasAddressDiscriminator() const { + return !getAddrDiscriminator()->isNullValue(); + } + + /// Check whether an authentication operation with key \p Key and (possibly + /// blended) discriminator \p Discriminator is known to be compatible with + /// this ptrauth signed pointer. 
+ bool isKnownCompatibleWith(const Value *Key, const Value *Discriminator, + const DataLayout &DL) const; + + /// Methods for support type inquiry through isa, cast, and dyn_cast: + static bool classof(const Value *V) { + return V->getValueID() == ConstantPtrAuthVal; + } +}; + +template <> +struct OperandTraits + : public FixedNumOperandTraits {}; + +DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ConstantPtrAuth, Constant) + //===----------------------------------------------------------------------===// /// A constant value that is initialized with an expression using /// other constant values. diff --git a/llvm/include/llvm/IR/Value.def b/llvm/include/llvm/IR/Value.def index 61f7a87666d094..3ece66a529e125 100644 --- a/llvm/include/llvm/IR/Value.def +++ b/llvm/include/llvm/IR/Value.def @@ -81,6 +81,7 @@ HANDLE_CONSTANT(BlockAddress) HANDLE_CONSTANT(ConstantExpr) HANDLE_CONSTANT_EXCLUDE_LLVM_C_API(DSOLocalEquivalent) HANDLE_CONSTANT_EXCLUDE_LLVM_C_API(NoCFIValue) +HANDLE_CONSTANT_EXCLUDE_LLVM_C_API(ConstantPtrAuth) // ConstantAggregate. HANDLE_CONSTANT(ConstantArray) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 3baa8ede28ffaf..08138a5e2f2d9d 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -3140,6 +3140,10 @@ bool isKnownNonZero(const Value *V, const APInt &DemandedElts, return true; } + // Constant ptrauth can be null, iff the base pointer can be. + if (auto *CPA = dyn_cast(V)) + return isKnownNonZero(CPA->getPointer(), DemandedElts, Q, Depth); + // A global variable in address space 0 is non null unless extern weak // or an absolute symbol reference. Other address spaces may have null as a // valid address for a global, so we can't assume anything. 
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp index 20a1bd29577124..d3ab306904da12 100644 --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -710,6 +710,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(blockaddress); KEYWORD(dso_local_equivalent); KEYWORD(no_cfi); + KEYWORD(ptrauth); // Metadata types. KEYWORD(distinct); diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 5d2056d2085672..df0827996396ef 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -4046,6 +4046,60 @@ bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) { ID.NoCFI = true; return false; } + case lltok::kw_ptrauth: { + // ValID ::= 'ptrauth' '(' ptr @foo ',' i32 + // (',' i64 (',' ptr addrdisc)? )? ')' + Lex.Lex(); + + Constant *Ptr, *Key; + Constant *Disc = nullptr, *AddrDisc = nullptr; + + if (parseToken(lltok::lparen, + "expected '(' in constant ptrauth expression") || + parseGlobalTypeAndValue(Ptr) || + parseToken(lltok::comma, + "expected comma in constant ptrauth expression") || + parseGlobalTypeAndValue(Key)) + return true; + // If present, parse the optional disc/addrdisc. 
+ if (EatIfPresent(lltok::comma)) + if (parseGlobalTypeAndValue(Disc) || + (EatIfPresent(lltok::comma) && parseGlobalTypeAndValue(AddrDisc))) + return true; + if (parseToken(lltok::rparen, + "expected ')' in constant ptrauth expression")) + return true; + + if (!Ptr->getType()->isPointerTy()) + return error(ID.Loc, "constant ptrauth base pointer must be a pointer"); + + auto *KeyC = dyn_cast(Key); + if (!KeyC || KeyC->getBitWidth() != 32) + return error(ID.Loc, "constant ptrauth key must be i32 constant"); + + ConstantInt *DiscC = nullptr; + if (Disc) { + DiscC = dyn_cast(Disc); + if (!DiscC || DiscC->getBitWidth() != 64) + return error( + ID.Loc, + "constant ptrauth integer discriminator must be i64 constant"); + } else { + DiscC = ConstantInt::get(Type::getInt64Ty(Context), 0); + } + + if (AddrDisc) { + if (!AddrDisc->getType()->isPointerTy()) + return error( + ID.Loc, "constant ptrauth address discriminator must be a pointer"); + } else { + AddrDisc = ConstantPointerNull::get(PointerType::get(Context, 0)); + } + + ID.ConstantVal = ConstantPtrAuth::get(Ptr, KeyC, DiscC, AddrDisc); + ID.Kind = ValID::t_Constant; + return false; + } case lltok::kw_trunc: case lltok::kw_bitcast: diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp index c085c715179ba6..b7ed9cdf631454 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp @@ -222,6 +222,7 @@ GetCodeName(unsigned CodeID, unsigned BlockID, STRINGIFY_CODE(CST_CODE, CE_UNOP) STRINGIFY_CODE(CST_CODE, DSO_LOCAL_EQUIVALENT) STRINGIFY_CODE(CST_CODE, NO_CFI_VALUE) + STRINGIFY_CODE(CST_CODE, PTRAUTH) case bitc::CST_CODE_BLOCKADDRESS: return "CST_CODE_BLOCKADDRESS"; STRINGIFY_CODE(CST_CODE, DATA) diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 32b9a033173e93..aee627bbde0bf5 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ 
b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -517,7 +517,8 @@ class BitcodeConstant final : public Value, static constexpr uint8_t NoCFIOpcode = 252; static constexpr uint8_t DSOLocalEquivalentOpcode = 251; static constexpr uint8_t BlockAddressOpcode = 250; - static constexpr uint8_t FirstSpecialOpcode = BlockAddressOpcode; + static constexpr uint8_t ConstantPtrAuthOpcode = 249; + static constexpr uint8_t FirstSpecialOpcode = ConstantPtrAuthOpcode; // Separate struct to make passing different number of parameters to // BitcodeConstant::create() more convenient. @@ -1562,6 +1563,18 @@ Expected BitcodeReader::materializeValue(unsigned StartValID, C = ConstantExpr::get(BC->Opcode, ConstOps[0], ConstOps[1], BC->Flags); } else { switch (BC->Opcode) { + case BitcodeConstant::ConstantPtrAuthOpcode: { + auto *Key = dyn_cast(ConstOps[1]); + if (!Key) + return error("ptrauth key operand must be ConstantInt"); + + auto *Disc = dyn_cast(ConstOps[2]); + if (!Disc) + return error("ptrauth disc operand must be ConstantInt"); + + C = ConstantPtrAuth::get(ConstOps[0], Key, Disc, ConstOps[3]); + break; + } case BitcodeConstant::NoCFIOpcode: { auto *GV = dyn_cast(ConstOps[0]); if (!GV) @@ -3644,6 +3657,16 @@ Error BitcodeReader::parseConstants() { Record[1]); break; } + case bitc::CST_CODE_PTRAUTH: { + if (Record.size() < 4) + return error("Invalid ptrauth record"); + // Ptr, Key, Disc, AddrDisc + V = BitcodeConstant::create(Alloc, CurTy, + BitcodeConstant::ConstantPtrAuthOpcode, + {(unsigned)Record[0], (unsigned)Record[1], + (unsigned)Record[2], (unsigned)Record[3]}); + break; + } } assert(V->getType() == getTypeByID(CurTyID) && "Incorrect result type ID"); diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 3d653fe4458f4b..046dad5721c4ce 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -2848,6 +2848,12 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, 
unsigned LastVal, Code = bitc::CST_CODE_NO_CFI_VALUE; Record.push_back(VE.getTypeID(NC->getGlobalValue()->getType())); Record.push_back(VE.getValueID(NC->getGlobalValue())); + } else if (const auto *CPA = dyn_cast(C)) { + Code = bitc::CST_CODE_PTRAUTH; + Record.push_back(VE.getValueID(CPA->getPointer())); + Record.push_back(VE.getValueID(CPA->getKey())); + Record.push_back(VE.getValueID(CPA->getDiscriminator())); + Record.push_back(VE.getValueID(CPA->getAddrDiscriminator())); } else { #ifndef NDEBUG C->dump(); diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index ced5d78f994ab5..8b1a21f962b08f 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -1594,6 +1594,27 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, return; } + if (const ConstantPtrAuth *CPA = dyn_cast(CV)) { + Out << "ptrauth ("; + + // ptrauth (ptr CST, i32 KEY[, i64 DISC[, ptr ADDRDISC]?]?) + unsigned NumOpsToWrite = 2; + if (!CPA->getOperand(2)->isNullValue()) + NumOpsToWrite = 3; + if (!CPA->getOperand(3)->isNullValue()) + NumOpsToWrite = 4; + + ListSeparator LS; + for (unsigned i = 0, e = NumOpsToWrite; i != e; ++i) { + Out << LS; + WriterCtx.TypePrinter->print(CPA->getOperand(i)->getType(), Out); + Out << ' '; + WriteAsOperandInternal(Out, CPA->getOperand(i), WriterCtx); + } + Out << ')'; + return; + } + if (const ConstantArray *CA = dyn_cast(CV)) { Type *ETy = CA->getType()->getElementType(); Out << '['; diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index cfb89d557db479..119fcb4fa03461 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -550,6 +550,9 @@ void llvm::deleteConstant(Constant *C) { case Constant::NoCFIValueVal: delete static_cast(C); break; + case Constant::ConstantPtrAuthVal: + delete static_cast(C); + break; case Constant::UndefValueVal: delete static_cast(C); break; @@ -2015,6 +2018,124 @@ Value *NoCFIValue::handleOperandChangeImpl(Value *From, Value *To) { return nullptr; } 
+//---- ConstantPtrAuth::get() implementations. +// + +ConstantPtrAuth *ConstantPtrAuth::get(Constant *Ptr, ConstantInt *Key, + ConstantInt *Disc, Constant *AddrDisc) { + Constant *ArgVec[] = {Ptr, Key, Disc, AddrDisc}; + ConstantPtrAuthKeyType MapKey(ArgVec); + LLVMContextImpl *pImpl = Ptr->getContext().pImpl; + return pImpl->ConstantPtrAuths.getOrCreate(Ptr->getType(), MapKey); +} + +ConstantPtrAuth *ConstantPtrAuth::getWithSameSchema(Constant *Pointer) const { + return get(Pointer, getKey(), getDiscriminator(), getAddrDiscriminator()); +} + +ConstantPtrAuth::ConstantPtrAuth(Constant *Ptr, ConstantInt *Key, + ConstantInt *Disc, Constant *AddrDisc) + : Constant(Ptr->getType(), Value::ConstantPtrAuthVal, &Op<0>(), 4) { + assert(Ptr->getType()->isPointerTy()); + assert(Key->getBitWidth() == 32); + assert(Disc->getBitWidth() == 64); + assert(AddrDisc->getType()->isPointerTy()); + setOperand(0, Ptr); + setOperand(1, Key); + setOperand(2, Disc); + setOperand(3, AddrDisc); +} + +/// Remove the constant from the constant table. 
+void ConstantPtrAuth::destroyConstantImpl() { + getType()->getContext().pImpl->ConstantPtrAuths.remove(this); +} + +Value *ConstantPtrAuth::handleOperandChangeImpl(Value *From, Value *ToV) { + assert(isa(ToV) && "Cannot make Constant refer to non-constant!"); + Constant *To = cast(ToV); + + SmallVector Values; + Values.reserve(getNumOperands()); + + unsigned NumUpdated = 0; + + Use *OperandList = getOperandList(); + unsigned OperandNo = 0; + for (Use *O = OperandList, *E = OperandList + getNumOperands(); O != E; ++O) { + Constant *Val = cast(O->get()); + if (Val == From) { + OperandNo = (O - OperandList); + Val = To; + ++NumUpdated; + } + Values.push_back(Val); + } + + return getContext().pImpl->ConstantPtrAuths.replaceOperandsInPlace( + Values, this, From, To, NumUpdated, OperandNo); +} + +bool ConstantPtrAuth::isKnownCompatibleWith(const Value *Key, + const Value *Discriminator, + const DataLayout &DL) const { + // If the keys are different, there's no chance for this to be compatible. + if (getKey() != Key) + return false; + + // We can have 3 kinds of discriminators: + // - simple, integer-only: `i64 x, ptr null` vs. `i64 x` + // - address-only: `i64 0, ptr p` vs. `ptr p` + // - blended address/integer: `i64 x, ptr p` vs. `@llvm.ptrauth.blend(p, x)` + + // If this constant has a simple discriminator (integer, no address), easy: + // it's compatible iff the provided full discriminator is also a simple + // discriminator, identical to our integer discriminator. + if (!hasAddressDiscriminator()) + return getDiscriminator() == Discriminator; + + // Otherwise, we can isolate address and integer discriminator components. + const Value *AddrDiscriminator = nullptr; + + // This constant may or may not have an integer discriminator (instead of 0). + if (!getDiscriminator()->isNullValue()) { + // If it does, there's an implicit blend. We need to have a matching blend + // intrinsic in the provided full discriminator. 
+ if (!match(Discriminator, + m_Intrinsic( + m_Value(AddrDiscriminator), m_Specific(getDiscriminator())))) + return false; + } else { + // Otherwise, interpret the provided full discriminator as address-only. + AddrDiscriminator = Discriminator; + } + + // Either way, we can now focus on comparing the address discriminators. + + // Discriminators are i64, so the provided addr disc may be a ptrtoint. + if (auto *Cast = dyn_cast(AddrDiscriminator)) + AddrDiscriminator = Cast->getPointerOperand(); + + // Beyond that, we're only interested in compatible pointers. + if (getAddrDiscriminator()->getType() != AddrDiscriminator->getType()) + return false; + + // These are often the same constant GEP, making them trivially equivalent. + if (getAddrDiscriminator() == AddrDiscriminator) + return true; + + // Finally, they may be equivalent base+offset expressions. + APInt Off1(DL.getIndexTypeSizeInBits(getAddrDiscriminator()->getType()), 0); + auto *Base1 = getAddrDiscriminator()->stripAndAccumulateConstantOffsets( + DL, Off1, /*AllowNonInbounds=*/true); + + APInt Off2(DL.getIndexTypeSizeInBits(AddrDiscriminator->getType()), 0); + auto *Base2 = AddrDiscriminator->stripAndAccumulateConstantOffsets( + DL, Off2, /*AllowNonInbounds=*/true); + + return Base1 == Base2 && Off1 == Off2; +} + //---- ConstantExpr::get() implementations. 
// diff --git a/llvm/lib/IR/ConstantsContext.h b/llvm/lib/IR/ConstantsContext.h index 7067d0d121117b..5153880b5cab64 100644 --- a/llvm/lib/IR/ConstantsContext.h +++ b/llvm/lib/IR/ConstantsContext.h @@ -23,6 +23,7 @@ #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" @@ -286,6 +287,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CompareConstantExpr, Value) template struct ConstantAggrKeyType; struct InlineAsmKeyType; struct ConstantExprKeyType; +struct ConstantPtrAuthKeyType; template struct ConstantInfo; template <> struct ConstantInfo { @@ -308,6 +310,10 @@ template <> struct ConstantInfo { using ValType = ConstantAggrKeyType; using TypeClass = VectorType; }; +template <> struct ConstantInfo { + using ValType = ConstantPtrAuthKeyType; + using TypeClass = Type; +}; template struct ConstantAggrKeyType { ArrayRef Operands; @@ -536,6 +542,47 @@ struct ConstantExprKeyType { } }; +struct ConstantPtrAuthKeyType { + ArrayRef Operands; + + ConstantPtrAuthKeyType(ArrayRef Operands) : Operands(Operands) {} + + ConstantPtrAuthKeyType(ArrayRef Operands, const ConstantPtrAuth *) + : Operands(Operands) {} + + ConstantPtrAuthKeyType(const ConstantPtrAuth *C, + SmallVectorImpl &Storage) { + assert(Storage.empty() && "Expected empty storage"); + for (unsigned I = 0, E = C->getNumOperands(); I != E; ++I) + Storage.push_back(cast(C->getOperand(I))); + Operands = Storage; + } + + bool operator==(const ConstantPtrAuthKeyType &X) const { + return Operands == X.Operands; + } + + bool operator==(const ConstantPtrAuth *C) const { + if (Operands.size() != C->getNumOperands()) + return false; + for (unsigned I = 0, E = Operands.size(); I != E; ++I) + if (Operands[I] != C->getOperand(I)) + return false; + return true; + } + + unsigned getHash() const { + return hash_combine_range(Operands.begin(), Operands.end()); + } + + 
using TypeClass = typename ConstantInfo::TypeClass; + + ConstantPtrAuth *create(TypeClass *Ty) const { + return new ConstantPtrAuth(Operands[0], cast(Operands[1]), + cast(Operands[2]), Operands[3]); + } +}; + // Free memory for a given constant. Assumes the constant has already been // removed from all relevant maps. void deleteConstant(Constant *C); diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h index 399fe0dad26c73..392e0d16f1761e 100644 --- a/llvm/lib/IR/LLVMContextImpl.h +++ b/llvm/lib/IR/LLVMContextImpl.h @@ -1562,6 +1562,8 @@ class LLVMContextImpl { DenseMap NoCFIValues; + ConstantUniqueMap ConstantPtrAuths; + ConstantUniqueMap ExprConstants; ConstantUniqueMap InlineAsms; diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 50f8d6ec842017..684e54444621b5 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -629,6 +629,7 @@ class Verifier : public InstVisitor, VerifierSupport { void visitConstantExprsRecursively(const Constant *EntryC); void visitConstantExpr(const ConstantExpr *CE); + void visitConstantPtrAuth(const ConstantPtrAuth *CPA); void verifyInlineAsmCall(const CallBase &Call); void verifyStatepoint(const CallBase &Call); void verifyFrameRecoverIndices(); @@ -2422,6 +2423,9 @@ void Verifier::visitConstantExprsRecursively(const Constant *EntryC) { if (const auto *CE = dyn_cast(C)) visitConstantExpr(CE); + if (const auto *CPA = dyn_cast(C)) + visitConstantPtrAuth(CPA); + if (const auto *GV = dyn_cast(C)) { // Global Values get visited separately, but we do need to make sure // that the global value is in the correct module @@ -2449,6 +2453,23 @@ void Verifier::visitConstantExpr(const ConstantExpr *CE) { "Invalid bitcast", CE); } +void Verifier::visitConstantPtrAuth(const ConstantPtrAuth *CPA) { + Check(CPA->getPointer()->getType()->isPointerTy(), + "signed ptrauth constant base pointer must have pointer type"); + + Check(CPA->getType() == CPA->getPointer()->getType(), + "signed ptrauth 
constant must have same type as its base pointer"); + + Check(CPA->getKey()->getBitWidth() == 32, + "signed ptrauth constant key must be i32 constant integer"); + + Check(CPA->getAddrDiscriminator()->getType()->isPointerTy(), + "signed ptrauth constant address discriminator must be a pointer"); + + Check(CPA->getDiscriminator()->getBitWidth() == 64, + "signed ptrauth constant discriminator must be i64 constant integer"); +} + bool Verifier::verifyAttributeCount(AttributeList Attrs, unsigned Params) { // There shouldn't be more attribute sets than there are parameters plus the // function and return value. @@ -5090,6 +5111,8 @@ void Verifier::visitInstruction(Instruction &I) { } else if (isa(I.getOperand(i))) { Check(CBI && &CBI->getCalledOperandUse() == &I.getOperandUse(i), "Cannot take the address of an inline asm!", &I); + } else if (auto *CPA = dyn_cast(I.getOperand(i))) { + visitConstantExprsRecursively(CPA); } else if (ConstantExpr *CE = dyn_cast(I.getOperand(i))) { if (CE->getType()->isPtrOrPtrVectorTy()) { // If we have a ConstantExpr pointer, we need to see if it came from an diff --git a/llvm/test/Assembler/invalid-ptrauth-const1.ll b/llvm/test/Assembler/invalid-ptrauth-const1.ll new file mode 100644 index 00000000000000..fba2e230782382 --- /dev/null +++ b/llvm/test/Assembler/invalid-ptrauth-const1.ll @@ -0,0 +1,6 @@ +; RUN: not llvm-as < %s 2>&1 | FileCheck %s + +@var = global i32 0 + +; CHECK: error: constant ptrauth base pointer must be a pointer +@auth_var = global ptr ptrauth (i32 42, i32 0) diff --git a/llvm/test/Assembler/invalid-ptrauth-const2.ll b/llvm/test/Assembler/invalid-ptrauth-const2.ll new file mode 100644 index 00000000000000..4499c42601c99e --- /dev/null +++ b/llvm/test/Assembler/invalid-ptrauth-const2.ll @@ -0,0 +1,6 @@ +; RUN: not llvm-as < %s 2>&1 | FileCheck %s + +@var = global i32 0 + +; CHECK: error: constant ptrauth key must be i32 constant +@auth_var = global ptr ptrauth (ptr @var, i32 ptrtoint (ptr @var to i32)) diff --git 
a/llvm/test/Assembler/invalid-ptrauth-const3.ll b/llvm/test/Assembler/invalid-ptrauth-const3.ll new file mode 100644 index 00000000000000..3f2688d92a0010 --- /dev/null +++ b/llvm/test/Assembler/invalid-ptrauth-const3.ll @@ -0,0 +1,6 @@ +; RUN: not llvm-as < %s 2>&1 | FileCheck %s + +@var = global i32 0 + +; CHECK: error: constant ptrauth address discriminator must be a pointer +@auth_var = global ptr ptrauth (ptr @var, i32 2, i64 65535, i8 0) diff --git a/llvm/test/Assembler/invalid-ptrauth-const4.ll b/llvm/test/Assembler/invalid-ptrauth-const4.ll new file mode 100644 index 00000000000000..843a220458a61b --- /dev/null +++ b/llvm/test/Assembler/invalid-ptrauth-const4.ll @@ -0,0 +1,6 @@ +; RUN: not llvm-as < %s 2>&1 | FileCheck %s + +@var = global i32 0 + +; CHECK: error: constant ptrauth integer discriminator must be i64 constant +@auth_var = global ptr ptrauth (ptr @var, i32 2, ptr null, i64 ptrtoint (ptr @var to i64)) diff --git a/llvm/test/Assembler/invalid-ptrauth-const5.ll b/llvm/test/Assembler/invalid-ptrauth-const5.ll new file mode 100644 index 00000000000000..9b47f6f5f423fc --- /dev/null +++ b/llvm/test/Assembler/invalid-ptrauth-const5.ll @@ -0,0 +1,6 @@ +; RUN: not llvm-as < %s 2>&1 | FileCheck %s + +@var = global i32 0 + +; CHECK: error: constant ptrauth integer discriminator must be i64 constant +@auth_var = global ptr ptrauth (ptr @var, i32 2, ptr @var)) diff --git a/llvm/test/Assembler/ptrauth-const.ll b/llvm/test/Assembler/ptrauth-const.ll new file mode 100644 index 00000000000000..94d35146d5927b --- /dev/null +++ b/llvm/test/Assembler/ptrauth-const.ll @@ -0,0 +1,24 @@ +; RUN: llvm-as < %s | llvm-dis | FileCheck %s + +@var = global i32 0 + +; CHECK: @basic = global ptr ptrauth (ptr @var, i32 0) +@basic = global ptr ptrauth (ptr @var, i32 0) + +; CHECK: @keyed = global ptr ptrauth (ptr @var, i32 3) +@keyed = global ptr ptrauth (ptr @var, i32 3) + +; CHECK: @intdisc = global ptr ptrauth (ptr @var, i32 0, i64 -1) +@intdisc = global ptr ptrauth (ptr @var, 
i32 0, i64 -1) + +; CHECK: @addrdisc = global ptr ptrauth (ptr @var, i32 2, i64 1234, ptr @addrdisc) +@addrdisc = global ptr ptrauth (ptr @var, i32 2, i64 1234, ptr @addrdisc) + + +@var1 = addrspace(1) global i32 0 + +; CHECK: @addrspace = global ptr addrspace(1) ptrauth (ptr addrspace(1) @var1, i32 0) +@addrspace = global ptr addrspace(1) ptrauth (ptr addrspace(1) @var1, i32 0) + +; CHECK: @addrspace_addrdisc = addrspace(2) global ptr addrspace(1) ptrauth (ptr addrspace(1) @var1, i32 2, i64 1234, ptr addrspace(2) @addrspace_addrdisc) +@addrspace_addrdisc = addrspace(2) global ptr addrspace(1) ptrauth (ptr addrspace(1) @var1, i32 2, i64 1234, ptr addrspace(2) @addrspace_addrdisc) diff --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll index b374924516d665..2a846e036924c7 100644 --- a/llvm/test/Bitcode/compatibility.ll +++ b/llvm/test/Bitcode/compatibility.ll @@ -217,6 +217,10 @@ declare void @g.f1() ; CHECK: @g.sanitize_address_dyninit = global i32 0, sanitize_address_dyninit ; CHECK: @g.sanitize_multiple = global i32 0, sanitize_memtag, sanitize_address_dyninit +; ptrauth constant +@auth_var = global ptr ptrauth (ptr @g1, i32 0, i64 65535, ptr null) +; CHECK: @auth_var = global ptr ptrauth (ptr @g1, i32 0, i64 65535) + ;; Aliases ; Format: @ = [Linkage] [Visibility] [DLLStorageClass] [ThreadLocal] ; [unnamed_addr] alias @ diff --git a/llvm/utils/vim/syntax/llvm.vim b/llvm/utils/vim/syntax/llvm.vim index d86e3d1ddbc27f..905d696400ca37 100644 --- a/llvm/utils/vim/syntax/llvm.vim +++ b/llvm/utils/vim/syntax/llvm.vim @@ -150,6 +150,7 @@ syn keyword llvmKeyword \ preallocated \ private \ protected + \ ptrauth \ ptx_device \ ptx_kernel \ readnone From 6f529aaf666624c26715aa348955b26a684d1250 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 28 May 2024 23:37:40 +0000 Subject: [PATCH 038/230] [WebAssembly] Remove IIT_EXNREF This was added in #93586 but caused a compilation warning and is not used anyway. 
--- llvm/include/llvm/IR/Intrinsics.td | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index c3ac53837444ef..107442623ab7bd 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -316,7 +316,6 @@ def IIT_PPCF128 : IIT_VT; def IIT_V3 : IIT_Vec<3, 53>; def IIT_EXTERNREF : IIT_VT; def IIT_FUNCREF : IIT_VT; -def IIT_EXNREF: IIT_VT; def IIT_I2 : IIT_Int<2, 57>; def IIT_I4 : IIT_Int<4, 58>; def IIT_AARCH64_SVCOUNT : IIT_VT; From bd5cd4b837b67f8d549f072f37dd09295b4bf9f7 Mon Sep 17 00:00:00 2001 From: Eric Fiselier Date: Tue, 28 May 2024 20:01:47 -0400 Subject: [PATCH 039/230] Fix trigger for libc++ job rerunner. Testing github actions is such a pain. I swear it should match now. --- .github/workflows/restart-preempted-libcxx-jobs.yaml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/restart-preempted-libcxx-jobs.yaml b/.github/workflows/restart-preempted-libcxx-jobs.yaml index 5682b0a4f52c3d..88924fb3cd7791 100644 --- a/.github/workflows/restart-preempted-libcxx-jobs.yaml +++ b/.github/workflows/restart-preempted-libcxx-jobs.yaml @@ -11,18 +11,16 @@ name: Restart Preempted Libc++ Workflow on: workflow_run: - workflows: - - Build and Test libc\+\+ + workflows: [Build and Test libc\+\+] types: - - failure - - canceled + - completed permissions: contents: read jobs: restart: - if: github.repository_owner == 'llvm' + if: github.repository_owner == 'llvm' && (github.event.workflow_run.conclusion == 'failure' || github.event.workflow_run.conclusion == 'cancelled') name: "Restart Job" permissions: statuses: read From 5bfe4b93e15ad38f211c5dec64be0eeaa4c8e914 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Tue, 28 May 2024 20:04:41 -0400 Subject: [PATCH 040/230] [mlir][arith] Disallow casting tensor dimensions (#93349) Tighten the verifier for arith cast ops to disallow changing tensor dimensions, e.g., static to dynamic. 
After this change: * `arith.cast_op %x : tensor<4xi32> to tensor<4xf32>` remains valid * `arith.cast_op %x : tensor<4xi32> to tensor` becomes invalid * `arith.cast_op %x : tensor to tensor<4xf32>` becomes invalid This is mostly to simplify the op semantics. See the discussion thread for more context: https://discourse.llvm.org/t/rfc-remove-arith-math-ops-on-tensors/74357/63. --- .../include/mlir/Dialect/Arith/IR/ArithOps.td | 19 +++++++-- mlir/test/Dialect/Arith/canonicalize.mlir | 8 ---- mlir/test/Dialect/Arith/invalid.mlir | 42 ++++++++++++++++++- 3 files changed, 57 insertions(+), 12 deletions(-) diff --git a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td index 46248dad3be9e0..81ed0f924a2e2c 100644 --- a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td +++ b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td @@ -83,12 +83,25 @@ class Arith_FloatBinaryOp traits = []> : attr-dict `:` type($result) }]; } +// Checks that tensor input and outputs have identical shapes. This is stricker +// than the verification done in `SameOperandsAndResultShape` that allows for +// tensor dimensions to be 'compatible' (e.g., dynamic dimensions being +// compatible with static ones). +def SameInputOutputTensorDims : PredOpTrait< + "input and output have the same tensor dimensions", + AllMatchSameOperatorPred<["in", "out"], + "(::llvm::isa<::mlir::TensorType>($_self.getType()) ?" + " ::llvm::cast<::mlir::TensorType>($_self.getType()).getShape() :" + " ::llvm::ArrayRef{})">>; + // Base class for arithmetic cast operations. Requires a single operand and -// result. If either is a shaped type, then the other must be of the same shape. +// result. If either is a shaped type, then the other must be of the same +// shape. In the case of tensor types, this also includes the corresponding +// operand/result dimensions being equal. 
class Arith_CastOp traits = []> : Arith_Op]>, + SameInputOutputTensorDims, DeclareOpInterfaceMethods]>, Arguments<(ins From:$in)>, Results<(outs To:$out)> { let assemblyFormat = "$in attr-dict `:` type($in) `to` type($out)"; @@ -1231,7 +1244,7 @@ def Arith_TruncIOp : Arith_IToICastOp<"trunci"> { def Arith_TruncFOp : Arith_Op<"truncf", - [Pure, SameOperandsAndResultShape, + [Pure, SameOperandsAndResultShape, SameInputOutputTensorDims, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods]>, Arguments<(ins FloatLike:$in, diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir index 1a387c20c4b297..e4f95bb0545a20 100644 --- a/mlir/test/Dialect/Arith/canonicalize.mlir +++ b/mlir/test/Dialect/Arith/canonicalize.mlir @@ -2950,14 +2950,6 @@ func.func @unsignedExtendConstantResource() -> tensor { return %ext : tensor } -// Just checks that this doesn't crash. -// CHECK-LABEL: @signedExtendSplatAsDynamicShape -func.func @signedExtendSplatAsDynamicShape() -> tensor { - %splat = arith.constant dense<5> : tensor<2xi16> - %extsplat = arith.extsi %splat : tensor<2xi16> to tensor - return %extsplat : tensor -} - // CHECK-LABEL: @extsi_i0 // CHECK: %[[ZERO:.*]] = arith.constant 0 : i16 // CHECK: return %[[ZERO]] : i16 diff --git a/mlir/test/Dialect/Arith/invalid.mlir b/mlir/test/Dialect/Arith/invalid.mlir index ada849220bb839..652aa738ad3924 100644 --- a/mlir/test/Dialect/Arith/invalid.mlir +++ b/mlir/test/Dialect/Arith/invalid.mlir @@ -1,13 +1,21 @@ // RUN: mlir-opt -split-input-file %s -verify-diagnostics func.func @test_index_cast_shape_error(%arg0 : tensor) -> tensor<2xi64> { - // expected-error @+1 {{'arith.index_cast' op requires the same shape for all operands and results}} + // expected-error @+1 {{'arith.index_cast' op failed to verify that input and output have the same tensor dimensions}} %0 = arith.index_cast %arg0 : tensor to tensor<2xi64> return %0 : tensor<2xi64> } // ----- +func.func @test_index_cast_shape_dim_error(%arg0 : 
tensor<2xindex>) -> tensor { + // expected-error @+1 {{'arith.index_cast' op failed to verify that input and output have the same tensor dimensions}} + %0 = arith.index_cast %arg0 : tensor<2xindex> to tensor + return %0 : tensor +} + +// ----- + func.func @test_index_cast_tensor_error(%arg0 : tensor) -> i64 { // expected-error @+1 {{'arith.index_cast' op requires the same shape for all operands and results}} %0 = arith.index_cast %arg0 : tensor to i64 @@ -655,6 +663,14 @@ func.func @extsi_scalable_to_fl(%arg0 : vector<[4]xi32>) { // ----- +func.func @extsi_tensor_dim(%arg0 : tensor<4xi32>) { + // expected-error@+1 {{'arith.extsi' op failed to verify that input and output have the same tensor dimensions}} + %0 = arith.extsi %arg0 : tensor<4xi32> to tensor + return +} + +// ----- + func.func @extf_scalable_to_fl(%arg0 : vector<[4]xf32>) { // expected-error@+1 {{'arith.extf' op requires the same shape for all operands and results}} %0 = arith.extf %arg0 : vector<[4]xf32> to vector<4xf64> @@ -703,6 +719,22 @@ func.func @bitcast_scalable_to_fl(%arg0 : vector<[4]xf32>) { // ----- +func.func @bitcast_tensor_dim(%arg0 : tensor<4xf32>) { + // expected-error@+1 {{'arith.bitcast' op failed to verify that input and output have the same tensor dimensions}} + %0 = arith.bitcast %arg0 : tensor<4xf32> to tensor + return +} + +// ----- + +func.func @bitcast_tensor_dim(%arg0 : tensor) { + // expected-error@+1 {{'arith.bitcast' op failed to verify that input and output have the same tensor dimensions}} + %0 = arith.bitcast %arg0 : tensor to tensor<4xi32> + return +} + +// ----- + func.func @trunci_fl_to_scalable(%arg0 : vector<4xi32>) { // expected-error@+1 {{'arith.trunci' op requires the same shape for all operands and results}} %0 = arith.trunci %arg0 : vector<4xi32> to vector<[4]xi8> @@ -719,6 +751,14 @@ func.func @truncf_fl_to_scalable(%arg0 : vector<4xf64>) { // ----- +func.func @truncf_tensor_dim(%arg0 : tensor<4xf64>) { + // expected-error@+1 {{'arith.truncf' op failed to 
verify that input and output have the same tensor dimensions}} + %0 = arith.truncf %arg0 : tensor<4xf64> to tensor + return +} + +// ----- + func.func @extui_fl_to_scalable(%arg0 : vector<4xi32>) { // expected-error@+1 {{'arith.extui' op requires the same shape for all operands and results}} %0 = arith.extui %arg0 : vector<4xi32> to vector<[4]xi64> From 1c108c80dc5b878452c00e1411cb530a66122ea5 Mon Sep 17 00:00:00 2001 From: Sterling Augustine Date: Wed, 29 May 2024 00:27:07 +0000 Subject: [PATCH 041/230] Mark operator== const to avoid errors when asserts are enabled Without this change, the build will fail like so: llvm-project/lld/MachO/ObjC.cpp:1387:75: error: ISO C++20 considers use of overloaded operator '==' (with operand types 'ObjcCategoryMerger::PointerListInfo' and 'ObjcCategoryMerger::PointerListInfo') to be ambiguous despite there being a unique best viable function [-Werror,-Wambiguous-reversed-operator] 1387 | parseProtocolListInfo(classIsec, roClassLayout.baseProtocolsOffset) == | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ^ 1388 | parseProtocolListInfo(metaIsec, roClassLayout.baseProtocolsOffset) && | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ include/assert.h:100:27: note: expanded from macro 'assert' 100 | (static_cast (expr) \ | ^~~~ llvm-project/lld/MachO/ObjC.cpp:391:17: note: ambiguity is between a regular call to this operator and a call with the argument order reversed 391 | inline bool operator==(const PointerListInfo &cmp) { | ^ llvm-project/lld/MachO/ObjC.cpp:391:17: note: mark 'operator==' as const or add a matching 'operator!=' to resolve the ambiguity 1 error generated. 
--- lld/MachO/ObjC.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lld/MachO/ObjC.cpp b/lld/MachO/ObjC.cpp index 635ded554497ba..6e857cfcd92f6d 100644 --- a/lld/MachO/ObjC.cpp +++ b/lld/MachO/ObjC.cpp @@ -388,7 +388,7 @@ class ObjcCategoryMerger { : categoryPrefix(_categoryPrefix), pointersPerStruct(_pointersPerStruct) {} - inline bool operator==(const PointerListInfo &cmp) { + inline bool operator==(const PointerListInfo &cmp) const { return pointersPerStruct == cmp.pointersPerStruct && structSize == cmp.structSize && structCount == cmp.structCount && allPtrs == cmp.allPtrs; From 44d4b3b2eebdd5eed95dd78dc3939dd9f5ebc5e6 Mon Sep 17 00:00:00 2001 From: Hui Date: Wed, 29 May 2024 01:30:30 +0100 Subject: [PATCH 042/230] [libc++][test] Close LWG3382 and add tests (#93039) --- libcxx/docs/Status/Cxx20Issues.csv | 2 +- .../sequences/array/lwg3382.compile.pass.cpp | 25 +++++++++++++++++++ .../pairs/pairs.pair/lwg3382.compile.pass.cpp | 23 +++++++++++++++++ 3 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 libcxx/test/std/containers/sequences/array/lwg3382.compile.pass.cpp create mode 100644 libcxx/test/std/utilities/utility/pairs/pairs.pair/lwg3382.compile.pass.cpp diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv index 5f83fa3a92e872..179958854e8cb2 100644 --- a/libcxx/docs/Status/Cxx20Issues.csv +++ b/libcxx/docs/Status/Cxx20Issues.csv @@ -285,7 +285,7 @@ "`3379 `__","""``safe``\ "" in several library names is misleading","Prague","|Complete|","15.0","|ranges|" "`3380 `__","``common_type``\ and comparison categories","Prague","|Complete|","15.0","|spaceship|" "`3381 `__","``begin``\ and ``data``\ must agree for ``contiguous_range``\ ","Prague","|Nothing To Do|","","|ranges|" -"`3382 `__","NTTP for ``pair``\ and ``array``\ ","Prague","","" +"`3382 `__","NTTP for ``pair``\ and ``array``\ ","Prague","|Nothing To Do|","" "`3383 `__","|sect|\ [time.zone.leap.nonmembers] ``sys_seconds``\ should be 
replaced with ``seconds``\ ","Prague","|Complete|","19.0","|chrono|" "`3384 `__","``transform_view::*sentinel*``\ has an incorrect ``operator-``\ ","Prague","|Complete|","15.0","|ranges|" "`3385 `__","``common_iterator``\ is not sufficiently constrained for non-copyable iterators","Prague","|Complete|","15.0","|ranges|" diff --git a/libcxx/test/std/containers/sequences/array/lwg3382.compile.pass.cpp b/libcxx/test/std/containers/sequences/array/lwg3382.compile.pass.cpp new file mode 100644 index 00000000000000..8eed20990cc00b --- /dev/null +++ b/libcxx/test/std/containers/sequences/array/lwg3382.compile.pass.cpp @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +#include + +template +struct Test {}; + +void test() { + // LWG 3382. NTTP for pair and array + // https://cplusplus.github.io/LWG/issue3382 + constexpr std::array a{}; + [[maybe_unused]] Test test1{}; + + constexpr std::array b{}; + [[maybe_unused]] Test test2{}; +} diff --git a/libcxx/test/std/utilities/utility/pairs/pairs.pair/lwg3382.compile.pass.cpp b/libcxx/test/std/utilities/utility/pairs/pairs.pair/lwg3382.compile.pass.cpp new file mode 100644 index 00000000000000..dce9a5df220b21 --- /dev/null +++ b/libcxx/test/std/utilities/utility/pairs/pairs.pair/lwg3382.compile.pass.cpp @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +#include + +template +struct Test {}; + +void test() { + // LWG 3382. NTTP for pair and array + // https://cplusplus.github.io/LWG/issue3382 +#if !defined(_LIBCPP_DEPRECATED_ABI_DISABLE_PAIR_TRIVIAL_COPY_CTOR) + constexpr std::pair a{}; + [[maybe_unused]] Test test1{}; +#endif +} From d868f097053e19e828d7366f5dbb88add16998a2 Mon Sep 17 00:00:00 2001 From: Hui Date: Wed, 29 May 2024 01:32:44 +0100 Subject: [PATCH 043/230] [libc++] LWG3223 Broken requirements for shared_ptr converting constructors (#93071) --- libcxx/docs/Status/Cxx20Issues.csv | 2 +- libcxx/include/__memory/shared_ptr.h | 7 ++- .../nullptr_t_deleter.pass.cpp | 20 ++++++++ .../nullptr_t_deleter_allocator.pass.cpp | 21 ++++++++ .../pointer_deleter.pass.cpp | 44 +++-------------- .../pointer_deleter_allocator.pass.cpp | 47 ++++-------------- .../util.smartptr.shared.const/types.h | 49 +++++++++++++++++++ 7 files changed, 113 insertions(+), 77 deletions(-) create mode 100644 libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/types.h diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv index 179958854e8cb2..6fc40270af1580 100644 --- a/libcxx/docs/Status/Cxx20Issues.csv +++ b/libcxx/docs/Status/Cxx20Issues.csv @@ -200,7 +200,7 @@ "`3200 `__","``midpoint``\ should not constrain ``T``\ is complete","Prague","|Nothing To Do|","" "`3201 `__","``lerp``\ should be marked as ``noexcept``\ ","Prague","|Complete|","" "`3226 `__","``zoned_time``\ constructor from ``string_view``\ should accept ``zoned_time``\ ","Prague","","","|chrono|" -"`3233 `__","Broken requirements for ``shared_ptr``\ converting constructors","Prague","","" +"`3233 `__","Broken requirements for ``shared_ptr``\ converting constructors","Prague","|Complete|","19.0" "`3237 
`__","LWG 3038 and 3190 have inconsistent PRs","Prague","|Complete|","16.0" "`3238 `__","Insufficiently-defined behavior of ``std::function``\ deduction guides","Prague","","" "`3242 `__","``std::format``\ : missing rules for ``arg-id``\ in ``width``\ and ``precision``\ ","Prague","|Complete|","14.0","|format|" diff --git a/libcxx/include/__memory/shared_ptr.h b/libcxx/include/__memory/shared_ptr.h index 992b1ba43f100d..de5707c4a67b0c 100644 --- a/libcxx/include/__memory/shared_ptr.h +++ b/libcxx/include/__memory/shared_ptr.h @@ -403,6 +403,9 @@ struct __shared_ptr_deleter_ctor_reqs { __well_formed_deleter<_Dp, _Yp*>::value; }; +template +using __shared_ptr_nullptr_deleter_ctor_reqs = _And, __well_formed_deleter<_Dp, nullptr_t> >; + #if defined(_LIBCPP_ABI_ENABLE_SHARED_PTR_TRIVIAL_ABI) # define _LIBCPP_SHARED_PTR_TRIVIAL_ABI __attribute__((__trivial_abi__)) #else @@ -498,7 +501,7 @@ class _LIBCPP_SHARED_PTR_TRIVIAL_ABI _LIBCPP_TEMPLATE_VIS shared_ptr { #endif // _LIBCPP_HAS_NO_EXCEPTIONS } - template + template ::value, int> = 0 > _LIBCPP_HIDE_FROM_ABI shared_ptr(nullptr_t __p, _Dp __d) : __ptr_(nullptr) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS try { @@ -518,7 +521,7 @@ class _LIBCPP_SHARED_PTR_TRIVIAL_ABI _LIBCPP_TEMPLATE_VIS shared_ptr { #endif // _LIBCPP_HAS_NO_EXCEPTIONS } - template + template ::value, int> = 0 > _LIBCPP_HIDE_FROM_ABI shared_ptr(nullptr_t __p, _Dp __d, _Alloc __a) : __ptr_(nullptr) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS try { diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter.pass.cpp index 49497b6956b9fb..13340ed5294c05 100644 --- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter.pass.cpp +++ 
b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter.pass.cpp @@ -17,6 +17,7 @@ #include "test_macros.h" #include "deleter_types.h" +#include "types.h" struct A { static int count; @@ -28,6 +29,25 @@ struct A int A::count = 0; +// LWG 3233. Broken requirements for shared_ptr converting constructors +// https://cplusplus.github.io/LWG/issue3233 +static_assert( std::is_constructible, std::nullptr_t, test_deleter >::value, ""); +static_assert(!std::is_constructible, std::nullptr_t, bad_deleter>::value, ""); +static_assert(!std::is_constructible, std::nullptr_t, no_nullptr_deleter>::value, ""); +static_assert(!std::is_constructible, std::nullptr_t, no_move_deleter>::value, ""); + +#if TEST_STD_VER >= 17 +static_assert( std::is_constructible, std::nullptr_t, test_deleter >::value, ""); +static_assert(!std::is_constructible, std::nullptr_t, bad_deleter>::value, ""); +static_assert(!std::is_constructible, std::nullptr_t, no_nullptr_deleter>::value, ""); +static_assert(!std::is_constructible, std::nullptr_t, no_move_deleter>::value, ""); + +static_assert( std::is_constructible, std::nullptr_t, test_deleter >::value, ""); +static_assert(!std::is_constructible, std::nullptr_t, bad_deleter>::value, ""); +static_assert(!std::is_constructible, std::nullptr_t, no_nullptr_deleter>::value, ""); +static_assert(!std::is_constructible, std::nullptr_t, no_move_deleter>::value, ""); +#endif + int main(int, char**) { { diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter_allocator.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter_allocator.pass.cpp index 4e9fc227b99e81..53ca6fb5b234d4 100644 --- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter_allocator.pass.cpp +++ 
b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter_allocator.pass.cpp @@ -17,6 +17,8 @@ #include "test_allocator.h" #include "min_allocator.h" +#include "types.h" + struct A { static int count; @@ -28,6 +30,25 @@ struct A int A::count = 0; +// LWG 3233. Broken requirements for shared_ptr converting constructors +// https://cplusplus.github.io/LWG/issue3233 +static_assert( std::is_constructible, std::nullptr_t, test_deleter, test_allocator >::value, ""); +static_assert(!std::is_constructible, std::nullptr_t, bad_deleter, test_allocator >::value, ""); +static_assert(!std::is_constructible, std::nullptr_t, no_nullptr_deleter, test_allocator >::value, ""); +static_assert(!std::is_constructible, std::nullptr_t, no_move_deleter, test_allocator >::value, ""); + +#if TEST_STD_VER >= 17 +static_assert( std::is_constructible, std::nullptr_t, test_deleter, test_allocator >::value, ""); +static_assert(!std::is_constructible, std::nullptr_t, bad_deleter, test_allocator >::value, ""); +static_assert(!std::is_constructible, std::nullptr_t, no_nullptr_deleter, test_allocator >::value, ""); +static_assert(!std::is_constructible, std::nullptr_t, no_move_deleter, test_allocator >::value, ""); + +static_assert( std::is_constructible, std::nullptr_t, test_deleter, test_allocator >::value, ""); +static_assert(!std::is_constructible, std::nullptr_t, bad_deleter, test_allocator >::value, ""); +static_assert(!std::is_constructible, std::nullptr_t, no_nullptr_deleter, test_allocator >::value, ""); +static_assert(!std::is_constructible, std::nullptr_t, no_move_deleter, test_allocator >::value, ""); +#endif + int main(int, char**) { test_allocator_statistics alloc_stats; diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter.pass.cpp index 
42225a4b0be7ec..9c1e9b72be573c 100644 --- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter.pass.cpp +++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter.pass.cpp @@ -17,6 +17,8 @@ #include "test_macros.h" #include "deleter_types.h" +#include "types.h" + struct A { static int count; @@ -28,38 +30,8 @@ struct A int A::count = 0; -struct bad_ty { }; - -struct bad_deleter -{ - void operator()(bad_ty) { } -}; - -struct no_move_deleter -{ - no_move_deleter(no_move_deleter const&) = delete; - no_move_deleter(no_move_deleter &&) = delete; - void operator()(int*) { } -}; - -static_assert(!std::is_move_constructible::value, ""); - -struct Base { }; -struct Derived : Base { }; - -template -class MoveDeleter -{ - MoveDeleter(); - MoveDeleter(MoveDeleter const&); -public: - MoveDeleter(MoveDeleter&&) {} - - explicit MoveDeleter(int) {} - - void operator()(T* ptr) { delete ptr; } -}; - +// LWG 3233. 
Broken requirements for shared_ptr converting constructors +// https://cplusplus.github.io/LWG/issue3233 // https://llvm.org/PR60258 // Invalid constructor SFINAE for std::shared_ptr's array ctors static_assert( std::is_constructible, int*, test_deleter >::value, ""); @@ -68,12 +40,12 @@ static_assert( std::is_constructible, Derived*, test_dele static_assert(!std::is_constructible, int*, test_deleter >::value, ""); #if TEST_STD_VER >= 17 -static_assert( std::is_constructible, int*, test_deleter>::value, ""); +static_assert( std::is_constructible, int*, test_deleter >::value, ""); static_assert(!std::is_constructible, int*, bad_deleter>::value, ""); -static_assert(!std::is_constructible, int(*)[], test_deleter>::value, ""); -static_assert( std::is_constructible, int*, test_deleter>::value, ""); +static_assert(!std::is_constructible, int(*)[], test_deleter >::value, ""); +static_assert( std::is_constructible, int*, test_deleter >::value, ""); static_assert(!std::is_constructible, int*, bad_deleter>::value, ""); -static_assert(!std::is_constructible, int(*)[5], test_deleter>::value, ""); +static_assert(!std::is_constructible, int(*)[5], test_deleter >::value, ""); #endif int main(int, char**) diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter_allocator.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter_allocator.pass.cpp index a110525b9b922d..9dffbcdd59a735 100644 --- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter_allocator.pass.cpp +++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter_allocator.pass.cpp @@ -17,6 +17,7 @@ #include "test_allocator.h" #include "min_allocator.h" +#include "types.h" struct A { static int count; @@ -28,38 +29,8 @@ struct A int A::count = 0; -struct bad_ty { }; - -struct 
bad_deleter -{ - void operator()(bad_ty) { } -}; - -struct no_move_deleter -{ - no_move_deleter(no_move_deleter const&) = delete; - no_move_deleter(no_move_deleter &&) = delete; - void operator()(int*) { } -}; - -static_assert(!std::is_move_constructible::value, ""); - -struct Base { }; -struct Derived : Base { }; - -template -class MoveDeleter -{ - MoveDeleter(); - MoveDeleter(MoveDeleter const&); -public: - MoveDeleter(MoveDeleter&&) {} - - explicit MoveDeleter(int) {} - - void operator()(T* ptr) { delete ptr; } -}; - +// LWG 3233. Broken requirements for shared_ptr converting constructors +// https://cplusplus.github.io/LWG/issue3233 // https://llvm.org/PR60258 // Invalid constructor SFINAE for std::shared_ptr's array ctors static_assert( std::is_constructible, int*, test_deleter, test_allocator >::value, ""); @@ -68,12 +39,12 @@ static_assert( std::is_constructible, Derived*, test_dele static_assert(!std::is_constructible, int*, test_deleter, test_allocator >::value, ""); #if TEST_STD_VER >= 17 -static_assert( std::is_constructible, int*, test_deleter, test_allocator>::value, ""); -static_assert(!std::is_constructible, int*, bad_deleter, test_allocator>::value, ""); -static_assert(!std::is_constructible, int(*)[], test_deleter, test_allocator>::value, ""); -static_assert( std::is_constructible, int*, test_deleter, test_allocator>::value, ""); -static_assert(!std::is_constructible, int*, bad_deleter, test_allocator>::value, ""); -static_assert(!std::is_constructible, int(*)[5], test_deleter, test_allocator>::value, ""); +static_assert( std::is_constructible, int*, test_deleter, test_allocator >::value, ""); +static_assert(!std::is_constructible, int*, bad_deleter, test_allocator >::value, ""); +static_assert(!std::is_constructible, int(*)[], test_deleter, test_allocator >::value, ""); +static_assert( std::is_constructible, int*, test_deleter, test_allocator >::value, ""); +static_assert(!std::is_constructible, int*, bad_deleter, test_allocator >::value, ""); 
+static_assert(!std::is_constructible, int(*)[5], test_deleter, test_allocator >::value, ""); #endif diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/types.h b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/types.h new file mode 100644 index 00000000000000..5bfb3d70febea0 --- /dev/null +++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/types.h @@ -0,0 +1,49 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef TEST_STD_UTILITIES_MEMORY_UTIL_SMARTPTR_SHARED_CONST_TYPES_H +#define TEST_STD_UTILITIES_MEMORY_UTIL_SMARTPTR_SHARED_CONST_TYPES_H + +#include + +struct bad_ty {}; + +struct bad_deleter { + void operator()(bad_ty) {} +}; + +struct no_move_deleter { + no_move_deleter(no_move_deleter const&) = delete; + no_move_deleter(no_move_deleter&&) = delete; + void operator()(int*) {} +}; + +static_assert(!std::is_move_constructible::value, ""); + +struct no_nullptr_deleter { + void operator()(int*) const {} + void operator()(std::nullptr_t) const = delete; +}; + +struct Base {}; +struct Derived : Base {}; + +template +class MoveDeleter { + MoveDeleter(); + MoveDeleter(MoveDeleter const&); + +public: + MoveDeleter(MoveDeleter&&) {} + + explicit MoveDeleter(int) {} + + void operator()(T* ptr) { delete ptr; } +}; + +#endif // TEST_STD_UTILITIES_MEMORY_UTIL_SMARTPTR_SHARED_CONST_TYPES_H From 2ae3f7c29c1149098827df7edafa761e3e3eb420 Mon Sep 17 00:00:00 2001 From: Hui Date: Wed, 29 May 2024 01:34:29 +0100 Subject: [PATCH 044/230] [libc++][test] Close LWG3238 and add tests (#93043) --- 
libcxx/docs/Status/Cxx20Issues.csv | 2 +- .../func.wrap.func.con/deduct_F.pass.cpp | 30 +++++++++++++++---- .../func.wrap.func.con/deduct_F.verify.cpp | 30 ------------------- 3 files changed, 26 insertions(+), 36 deletions(-) delete mode 100644 libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.verify.cpp diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv index 6fc40270af1580..54517ab002b86b 100644 --- a/libcxx/docs/Status/Cxx20Issues.csv +++ b/libcxx/docs/Status/Cxx20Issues.csv @@ -202,7 +202,7 @@ "`3226 `__","``zoned_time``\ constructor from ``string_view``\ should accept ``zoned_time``\ ","Prague","","","|chrono|" "`3233 `__","Broken requirements for ``shared_ptr``\ converting constructors","Prague","|Complete|","19.0" "`3237 `__","LWG 3038 and 3190 have inconsistent PRs","Prague","|Complete|","16.0" -"`3238 `__","Insufficiently-defined behavior of ``std::function``\ deduction guides","Prague","","" +"`3238 `__","Insufficiently-defined behavior of ``std::function``\ deduction guides","Prague","|Nothing To Do|","" "`3242 `__","``std::format``\ : missing rules for ``arg-id``\ in ``width``\ and ``precision``\ ","Prague","|Complete|","14.0","|format|" "`3243 `__","``std::format``\ and negative zeroes","Prague","|Complete|","14.0","|format|" "`3247 `__","``ranges::iter_move``\ should perform ADL-only lookup of ``iter_move``\ ","Prague","|Complete|","15.0","|ranges|" diff --git a/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.pass.cpp b/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.pass.cpp index ef43ab9b64b5b5..381bcda761700c 100644 --- a/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.pass.cpp +++ b/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.pass.cpp @@ -118,10 +118,14 @@ int main(int, 
char**) { // Make sure we fail in a SFINAE-friendly manner when we try to deduce // from a type without a valid call operator. template ()})> -constexpr bool can_deduce() { return true; } +constexpr bool can_deduce_test(int) { return true; } template -constexpr bool can_deduce(...) { return false; } +constexpr bool can_deduce_test(...) { return false; } +template +constexpr bool can_deduce = can_deduce_test(0); + +struct valid { int operator()() const; }; struct invalid1 { }; struct invalid2 { template @@ -131,6 +135,22 @@ struct invalid3 { void operator()(int); void operator()(long); }; -static_assert(!can_deduce()); -static_assert(!can_deduce()); -static_assert(!can_deduce()); +static_assert( can_deduce); +static_assert(!can_deduce); +static_assert(!can_deduce); +static_assert(!can_deduce); + + +// LWG 3238. Insufficiently-defined behavior of std::function deduction guides +// https://cplusplus.github.io/LWG/issue3238 +// The deduction guides for std::function do not handle rvalue-ref qualified +// call operators and C-style variadics. It also doesn't deduce from nullptr_t. +// Make sure we stick to the specification. + +struct invalid_rvalue_ref { R operator()() && { return {}; } }; +struct invalid_c_vararg { R operator()(int, ...) { return {}; } }; + +static_assert(!can_deduce); +static_assert(!can_deduce); +static_assert(!can_deduce); + diff --git a/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.verify.cpp b/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.verify.cpp deleted file mode 100644 index 8a42d3be3571c0..00000000000000 --- a/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.verify.cpp +++ /dev/null @@ -1,30 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// - -// template -// function(F) -> function; - -// UNSUPPORTED: c++03, c++11, c++14 - -// The deduction guides for std::function do not handle rvalue-ref qualified -// call operators and C-style variadics. It also doesn't deduce from nullptr_t. -// Make sure we stick to the specification. - -#include - -struct R { }; -struct f0 { R operator()() && { return {}; } }; -struct f1 { R operator()(int, ...) { return {}; } }; - -void f() { - std::function f = f0{}; // expected-error{{no viable constructor or deduction guide for deduction of template arguments of 'function'}} - std::function g = f1{}; // expected-error{{no viable constructor or deduction guide for deduction of template arguments of 'function'}} - std::function h = nullptr; // expected-error{{no viable constructor or deduction guide for deduction of template arguments of 'function'}} -} From 0380044e16a1c016e001a56c0ca7f4db649a6cae Mon Sep 17 00:00:00 2001 From: Jim Ingham Date: Tue, 28 May 2024 17:47:08 -0700 Subject: [PATCH 045/230] Fix the EditLine unittest build on Darwin after PR 92865 There was a Darwin only use of setupterm (under USE_SETUPTERM_WORKAROUND) that required libcurses.dylib. That was added to the main build, but not to the unittest. --- lldb/unittests/CMakeLists.txt | 4 +++- lldb/unittests/Editline/CMakeLists.txt | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/lldb/unittests/CMakeLists.txt b/lldb/unittests/CMakeLists.txt index a2585a94b61558..728dec5006d6bf 100644 --- a/lldb/unittests/CMakeLists.txt +++ b/lldb/unittests/CMakeLists.txt @@ -51,11 +51,13 @@ if (NOT CMAKE_SYSTEM_NAME MATCHES "Windows") # FIXME: APITests.exe is not a valid googletest binary. 
add_subdirectory(API) endif() +if (NOT CMAKE_SYSTEM_NAME MATCHES "Darwin" OR LLDB_ENABLE_CURSES) + add_subdirectory(Editline) +endif() add_subdirectory(Breakpoint) add_subdirectory(Core) add_subdirectory(DataFormatter) add_subdirectory(Disassembler) -add_subdirectory(Editline) add_subdirectory(Expression) add_subdirectory(Host) add_subdirectory(Interpreter) diff --git a/lldb/unittests/Editline/CMakeLists.txt b/lldb/unittests/Editline/CMakeLists.txt index 4b2643d15c5fc6..f213bfd1ab5813 100644 --- a/lldb/unittests/Editline/CMakeLists.txt +++ b/lldb/unittests/Editline/CMakeLists.txt @@ -5,4 +5,5 @@ add_lldb_unittest(EditlineTests lldbHost lldbUtility LLVMTestingSupport + ${CURSES_LIBRARIES} ) From d11922ebb26d84d7807be7f6fbf4d7e92c97455d Mon Sep 17 00:00:00 2001 From: Eric Fiselier Date: Tue, 28 May 2024 20:53:58 -0400 Subject: [PATCH 046/230] Remove unneeded debug logging --- .github/workflows/restart-preempted-libcxx-jobs.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/restart-preempted-libcxx-jobs.yaml b/.github/workflows/restart-preempted-libcxx-jobs.yaml index 88924fb3cd7791..43a1b97f1947d1 100644 --- a/.github/workflows/restart-preempted-libcxx-jobs.yaml +++ b/.github/workflows/restart-preempted-libcxx-jobs.yaml @@ -45,7 +45,6 @@ jobs: check_run_ids = []; for (check_run of check_suites.data.check_runs) { console.log('Checking check run: ' + check_run.id); - console.log(check_run); if (check_run.status != 'completed') { console.log('Check run was not completed. Skipping.'); continue; From f0b57b60e3b47bb9f9181d8be68473706b883430 Mon Sep 17 00:00:00 2001 From: "Ruiling, Song" Date: Wed, 29 May 2024 08:58:19 +0800 Subject: [PATCH 047/230] [Coroutines] Remove one construction of DominatorTree (#93507) The DominatorTree can be reused if no CFG changes. 
--- llvm/lib/Transforms/Coroutines/CoroFrame.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index 38b8dab984db3a..8e829a53aeca27 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -2756,12 +2756,11 @@ static void sinkSpillUsesAfterCoroBegin(Function &F, /// after the suspend block. Doing so minimizes the lifetime of each variable, /// hence minimizing the amount of data we end up putting on the frame. static void sinkLifetimeStartMarkers(Function &F, coro::Shape &Shape, - SuspendCrossingInfo &Checker) { + SuspendCrossingInfo &Checker, + const DominatorTree &DT) { if (F.hasOptNone()) return; - DominatorTree DT(F); - // Collect all possible basic blocks which may dominate all uses of allocas. SmallPtrSet DomSet; DomSet.insert(&F.getEntryBlock()); @@ -3149,12 +3148,13 @@ void coro::buildCoroutineFrame( doRematerializations(F, Checker, MaterializableCallback); + const DominatorTree DT(F); FrameDataInfo FrameData; SmallVector LocalAllocas; SmallVector DeadInstructions; if (Shape.ABI != coro::ABI::Async && Shape.ABI != coro::ABI::Retcon && Shape.ABI != coro::ABI::RetconOnce) - sinkLifetimeStartMarkers(F, Shape, Checker); + sinkLifetimeStartMarkers(F, Shape, Checker, DT); // Collect the spills for arguments and other not-materializable values. for (Argument &A : F.args()) @@ -3162,7 +3162,6 @@ void coro::buildCoroutineFrame( if (Checker.isDefinitionAcrossSuspend(A, U)) FrameData.Spills[&A].push_back(cast(U)); - const DominatorTree DT(F); for (Instruction &I : instructions(F)) { // Values returned from coroutine structure intrinsics should not be part // of the Coroutine Frame. 
From e492aa5adbccb9f4025af7c4179f75378fcad41a Mon Sep 17 00:00:00 2001 From: Eric Fiselier Date: Tue, 28 May 2024 21:07:55 -0400 Subject: [PATCH 048/230] Remove one more unneeded debug log line --- .github/workflows/restart-preempted-libcxx-jobs.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/restart-preempted-libcxx-jobs.yaml b/.github/workflows/restart-preempted-libcxx-jobs.yaml index 43a1b97f1947d1..71e27ff2abb9f0 100644 --- a/.github/workflows/restart-preempted-libcxx-jobs.yaml +++ b/.github/workflows/restart-preempted-libcxx-jobs.yaml @@ -67,7 +67,6 @@ jobs: check_run_id: check_run_id }) - console.log(annotations); for (annotation of annotations.data) { if (annotation.annotation_level != 'failure') { continue; From f9672cb775afc47e5210a111d248a01c23c428fe Mon Sep 17 00:00:00 2001 From: yronglin Date: Wed, 29 May 2024 09:09:36 +0800 Subject: [PATCH 049/230] [NFC][libc++] Mark LWG3951 as implemented (#93191) Since we have already addressed the LWG issue, this PR marks LWG3951 as implemented. Signed-off-by: yronglin Co-authored-by: A. 
Jiang --- libcxx/docs/Status/Cxx2cIssues.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcxx/docs/Status/Cxx2cIssues.csv b/libcxx/docs/Status/Cxx2cIssues.csv index 76717e1d3448a5..8d24457186310c 100644 --- a/libcxx/docs/Status/Cxx2cIssues.csv +++ b/libcxx/docs/Status/Cxx2cIssues.csv @@ -29,7 +29,7 @@ "`3947 `__","Unexpected constraints on ``adjacent_transform_view::base()``","Kona November 2023","","","|ranges|" "`3948 `__","``possibly-const-range and as-const-pointer`` should be ``noexcept``","Kona November 2023","","","|ranges|" "`3949 `__","``std::atomic``'s trivial destructor dropped in C++17 spec wording","Kona November 2023","","","" -"`3951 `__","[expected.object.swap]: Using ``value()`` instead of ``has_value()``","Kona November 2023","","","" +"`3951 `__","[expected.object.swap]: Using ``value()`` instead of ``has_value()``","Kona November 2023","|Complete|","16.0","" "`3953 `__","``iter_move`` for ``common_iterator`` and ``counted_iterator`` should return ``decltype(auto)``","Kona November 2023","","","|ranges|" "`3957 `__","[container.alloc.reqmts] The value category of v should be claimed","Kona November 2023","","","" "`3965 `__","Incorrect example in [format.string.escaped] p3 for formatting of combining characters","Kona November 2023","|Complete|","19.0","|format|" From 6abc3876c35bbe8fb5dd6435dc60f2c816b97ef6 Mon Sep 17 00:00:00 2001 From: Jim Ingham Date: Tue, 28 May 2024 18:16:13 -0700 Subject: [PATCH 050/230] Revert "Fix the EditLine unittest build on Darwin after PR 92865" This reverts commit 0380044e16a1c016e001a56c0ca7f4db649a6cae. While I figure out some mysterious CMake error. 
--- lldb/unittests/CMakeLists.txt | 4 +--- lldb/unittests/Editline/CMakeLists.txt | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/lldb/unittests/CMakeLists.txt b/lldb/unittests/CMakeLists.txt index 728dec5006d6bf..a2585a94b61558 100644 --- a/lldb/unittests/CMakeLists.txt +++ b/lldb/unittests/CMakeLists.txt @@ -51,13 +51,11 @@ if (NOT CMAKE_SYSTEM_NAME MATCHES "Windows") # FIXME: APITests.exe is not a valid googletest binary. add_subdirectory(API) endif() -if (NOT CMAKE_SYSTEM_NAME MATCHES "Darwin" OR LLDB_ENABLE_CURSES) - add_subdirectory(Editline) -endif() add_subdirectory(Breakpoint) add_subdirectory(Core) add_subdirectory(DataFormatter) add_subdirectory(Disassembler) +add_subdirectory(Editline) add_subdirectory(Expression) add_subdirectory(Host) add_subdirectory(Interpreter) diff --git a/lldb/unittests/Editline/CMakeLists.txt b/lldb/unittests/Editline/CMakeLists.txt index f213bfd1ab5813..4b2643d15c5fc6 100644 --- a/lldb/unittests/Editline/CMakeLists.txt +++ b/lldb/unittests/Editline/CMakeLists.txt @@ -5,5 +5,4 @@ add_lldb_unittest(EditlineTests lldbHost lldbUtility LLVMTestingSupport - ${CURSES_LIBRARIES} ) From 04f01a2b9cedc291fa7dd941de841dc957c75a33 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Tue, 28 May 2024 18:29:11 -0700 Subject: [PATCH 051/230] [libc++] Make the __availability header a sub-header of __config (#93083) In essence, this header has always been related to configuration of the library but we didn't want to put it inside <__config> due to complexity reasons. Now that we have sub-headers in <__config>, we can move <__availability> to it and stop including it everywhere since we already obtain the required macros via <__config>. 
--- libcxx/CMakeLists.txt | 2 +- libcxx/include/CMakeLists.txt | 3 +- libcxx/include/__atomic/atomic_base.h | 1 - libcxx/include/__atomic/atomic_flag.h | 1 - libcxx/include/__atomic/atomic_sync.h | 1 - .../__charconv/to_chars_floating_point.h | 1 - libcxx/include/__chrono/file_clock.h | 1 - libcxx/include/__chrono/tzdb_list.h | 1 - libcxx/include/__config | 29 +----------- .../availability.h} | 9 ++-- libcxx/include/__configuration/language.h | 46 +++++++++++++++++++ libcxx/include/__exception/exception_ptr.h | 1 - .../include/__expected/bad_expected_access.h | 1 - libcxx/include/__filesystem/directory_entry.h | 1 - .../include/__filesystem/directory_iterator.h | 1 - .../include/__filesystem/filesystem_error.h | 1 - libcxx/include/__filesystem/operations.h | 1 - libcxx/include/__filesystem/path.h | 1 - libcxx/include/__filesystem/path_iterator.h | 1 - .../recursive_directory_iterator.h | 1 - libcxx/include/__filesystem/u8path.h | 1 - libcxx/include/__functional/function.h | 1 - libcxx/include/__fwd/memory_resource.h | 1 - libcxx/include/__fwd/string.h | 1 - .../__memory_resource/memory_resource.h | 1 - .../monotonic_buffer_resource.h | 1 - .../__memory_resource/polymorphic_allocator.h | 1 - .../synchronized_pool_resource.h | 1 - .../unsynchronized_pool_resource.h | 1 - libcxx/include/__ostream/print.h | 1 - libcxx/include/__stop_token/stop_callback.h | 1 - libcxx/include/__stop_token/stop_source.h | 1 - libcxx/include/__stop_token/stop_state.h | 1 - libcxx/include/__stop_token/stop_token.h | 1 - libcxx/include/__thread/jthread.h | 1 - libcxx/include/__thread/poll_with_backoff.h | 1 - libcxx/include/__verbose_abort | 1 - libcxx/include/barrier | 1 - libcxx/include/condition_variable | 1 - libcxx/include/deque | 1 - libcxx/include/forward_list | 1 - libcxx/include/fstream | 1 - libcxx/include/latch | 1 - libcxx/include/list | 1 - libcxx/include/map | 1 - libcxx/include/module.modulemap | 6 +-- libcxx/include/optional | 1 - libcxx/include/print | 1 - 
libcxx/include/regex | 1 - libcxx/include/semaphore | 1 - libcxx/include/set | 1 - libcxx/include/sstream | 1 - libcxx/include/unordered_map | 1 - libcxx/include/unordered_set | 1 - libcxx/include/variant | 1 - libcxx/include/vector | 1 - libcxx/include/version | 1 - libcxx/src/optional.cpp | 1 - libcxx/src/ostream.cpp | 1 - ...lity-with-pedantic-errors.compile.pass.cpp | 2 +- .../generate_feature_test_macro_components.py | 4 +- 61 files changed, 60 insertions(+), 94 deletions(-) rename libcxx/include/{__availability => __configuration/availability.h} (98%) create mode 100644 libcxx/include/__configuration/language.h diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt index cb5e0e5e6cdb56..bbde9abc57919e 100644 --- a/libcxx/CMakeLists.txt +++ b/libcxx/CMakeLists.txt @@ -122,7 +122,7 @@ option(LIBCXX_ENABLE_VENDOR_AVAILABILITY_ANNOTATIONS on definitions in a shared library. By default, we assume that we're not building libc++ for any specific vendor, and we disable those annotations. Vendors wishing to provide compile-time errors when using features unavailable on some version of - the shared library they shipped should turn this on and see `include/__availability` + the shared library they shipped should turn this on and see `include/__configuration/availability.h` for more details." 
OFF) if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 161d7a7d215bdd..cfe1f44777bcac 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -219,7 +219,6 @@ set(files __atomic/kill_dependency.h __atomic/memory_order.h __atomic/to_gcc_order.h - __availability __bit/bit_cast.h __bit/bit_ceil.h __bit/bit_floor.h @@ -315,7 +314,9 @@ set(files __condition_variable/condition_variable.h __config __configuration/abi.h + __configuration/availability.h __configuration/compiler.h + __configuration/language.h __configuration/platform.h __coroutine/coroutine_handle.h __coroutine/coroutine_traits.h diff --git a/libcxx/include/__atomic/atomic_base.h b/libcxx/include/__atomic/atomic_base.h index e9badccc25a620..d7a5b99b546910 100644 --- a/libcxx/include/__atomic/atomic_base.h +++ b/libcxx/include/__atomic/atomic_base.h @@ -14,7 +14,6 @@ #include <__atomic/cxx_atomic_impl.h> #include <__atomic/is_always_lock_free.h> #include <__atomic/memory_order.h> -#include <__availability> #include <__config> #include <__memory/addressof.h> #include <__type_traits/is_integral.h> diff --git a/libcxx/include/__atomic/atomic_flag.h b/libcxx/include/__atomic/atomic_flag.h index 3ec3366ecaaf98..00b157cdff78b7 100644 --- a/libcxx/include/__atomic/atomic_flag.h +++ b/libcxx/include/__atomic/atomic_flag.h @@ -13,7 +13,6 @@ #include <__atomic/contention_t.h> #include <__atomic/cxx_atomic_impl.h> #include <__atomic/memory_order.h> -#include <__availability> #include <__chrono/duration.h> #include <__config> #include <__memory/addressof.h> diff --git a/libcxx/include/__atomic/atomic_sync.h b/libcxx/include/__atomic/atomic_sync.h index 175700be54c010..1de5037329f812 100644 --- a/libcxx/include/__atomic/atomic_sync.h +++ b/libcxx/include/__atomic/atomic_sync.h @@ -13,7 +13,6 @@ #include <__atomic/cxx_atomic_impl.h> #include <__atomic/memory_order.h> #include <__atomic/to_gcc_order.h> -#include 
<__availability> #include <__chrono/duration.h> #include <__config> #include <__memory/addressof.h> diff --git a/libcxx/include/__charconv/to_chars_floating_point.h b/libcxx/include/__charconv/to_chars_floating_point.h index 08720e1078852b..118f316b21a102 100644 --- a/libcxx/include/__charconv/to_chars_floating_point.h +++ b/libcxx/include/__charconv/to_chars_floating_point.h @@ -10,7 +10,6 @@ #ifndef _LIBCPP___CHARCONV_TO_CHARS_FLOATING_POINT_H #define _LIBCPP___CHARCONV_TO_CHARS_FLOATING_POINT_H -#include <__availability> #include <__charconv/chars_format.h> #include <__charconv/to_chars_result.h> #include <__config> diff --git a/libcxx/include/__chrono/file_clock.h b/libcxx/include/__chrono/file_clock.h index 7d25729fec013a..4dd3f88ce5ba4b 100644 --- a/libcxx/include/__chrono/file_clock.h +++ b/libcxx/include/__chrono/file_clock.h @@ -10,7 +10,6 @@ #ifndef _LIBCPP___CHRONO_FILE_CLOCK_H #define _LIBCPP___CHRONO_FILE_CLOCK_H -#include <__availability> #include <__chrono/duration.h> #include <__chrono/system_clock.h> #include <__chrono/time_point.h> diff --git a/libcxx/include/__chrono/tzdb_list.h b/libcxx/include/__chrono/tzdb_list.h index 62db7e3d2e0b5e..aeef4fe1aba3c1 100644 --- a/libcxx/include/__chrono/tzdb_list.h +++ b/libcxx/include/__chrono/tzdb_list.h @@ -16,7 +16,6 @@ // Enable the contents of the header only when libc++ was built with experimental features enabled. 
#if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) -# include <__availability> # include <__chrono/time_zone.h> # include <__chrono/tzdb.h> # include <__config> diff --git a/libcxx/include/__config b/libcxx/include/__config index e048dad52c4664..79422e8f6c5d1b 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -12,6 +12,7 @@ #include <__config_site> #include <__configuration/abi.h> +#include <__configuration/availability.h> #include <__configuration/compiler.h> #include <__configuration/platform.h> @@ -35,25 +36,6 @@ # define _LIBCPP_FREESTANDING # endif -// NOLINTBEGIN(libcpp-cpp-version-check) -# ifndef _LIBCPP_STD_VER -# if __cplusplus <= 201103L -# define _LIBCPP_STD_VER 11 -# elif __cplusplus <= 201402L -# define _LIBCPP_STD_VER 14 -# elif __cplusplus <= 201703L -# define _LIBCPP_STD_VER 17 -# elif __cplusplus <= 202002L -# define _LIBCPP_STD_VER 20 -# elif __cplusplus <= 202302L -# define _LIBCPP_STD_VER 23 -# else -// Expected release year of the next C++ standard -# define _LIBCPP_STD_VER 26 -# endif -# endif // _LIBCPP_STD_VER -// NOLINTEND(libcpp-cpp-version-check) - // HARDENING { // TODO(hardening): deprecate this in LLVM 19. @@ -364,10 +346,6 @@ typedef __char32_t char32_t; # endif -# if !defined(__cpp_exceptions) || __cpp_exceptions < 199711L -# define _LIBCPP_HAS_NO_EXCEPTIONS -# endif - # define _LIBCPP_PREFERRED_ALIGNOF(_Tp) __alignof(_Tp) # if defined(_LIBCPP_COMPILER_CLANG_BASED) @@ -840,11 +818,6 @@ typedef __char32_t char32_t; # define _LIBCPP_CONSTEXPR_SINCE_CXX23 # endif -// Try to find out if RTTI is disabled. 
-# if !defined(__cpp_rtti) || __cpp_rtti < 199711L -# define _LIBCPP_HAS_NO_RTTI -# endif - # ifndef _LIBCPP_WEAK # define _LIBCPP_WEAK __attribute__((__weak__)) # endif diff --git a/libcxx/include/__availability b/libcxx/include/__configuration/availability.h similarity index 98% rename from libcxx/include/__availability rename to libcxx/include/__configuration/availability.h index e44ac1962df363..1115431115ec38 100644 --- a/libcxx/include/__availability +++ b/libcxx/include/__configuration/availability.h @@ -7,10 +7,11 @@ // //===----------------------------------------------------------------------===// -#ifndef _LIBCPP___AVAILABILITY -#define _LIBCPP___AVAILABILITY +#ifndef _LIBCPP___CONFIGURATION_AVAILABILITY_H +#define _LIBCPP___CONFIGURATION_AVAILABILITY_H -#include <__config> +#include <__configuration/compiler.h> +#include <__configuration/language.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -374,4 +375,4 @@ # define _LIBCPP_AVAILABILITY_INIT_PRIMARY_EXCEPTION #endif -#endif // _LIBCPP___AVAILABILITY +#endif // _LIBCPP___CONFIGURATION_AVAILABILITY_H diff --git a/libcxx/include/__configuration/language.h b/libcxx/include/__configuration/language.h new file mode 100644 index 00000000000000..fa62a7b6f5c2a1 --- /dev/null +++ b/libcxx/include/__configuration/language.h @@ -0,0 +1,46 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___CONFIGURATION_LANGUAGE_H +#define _LIBCPP___CONFIGURATION_LANGUAGE_H + +#include <__config_site> + +#ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER +# pragma GCC system_header +#endif + +// NOLINTBEGIN(libcpp-cpp-version-check) +#ifdef __cplusplus +# if __cplusplus <= 201103L +# define _LIBCPP_STD_VER 11 +# elif __cplusplus <= 201402L +# define _LIBCPP_STD_VER 14 +# elif __cplusplus <= 201703L +# define _LIBCPP_STD_VER 17 +# elif __cplusplus <= 202002L +# define _LIBCPP_STD_VER 20 +# elif __cplusplus <= 202302L +# define _LIBCPP_STD_VER 23 +# else +// Expected release year of the next C++ standard +# define _LIBCPP_STD_VER 26 +# endif +#endif // __cplusplus +// NOLINTEND(libcpp-cpp-version-check) + +#if !defined(__cpp_rtti) || __cpp_rtti < 199711L +# define _LIBCPP_HAS_NO_RTTI +#endif + +#if !defined(__cpp_exceptions) || __cpp_exceptions < 199711L +# define _LIBCPP_HAS_NO_EXCEPTIONS +#endif + +#endif // _LIBCPP___CONFIGURATION_LANGUAGE_H diff --git a/libcxx/include/__exception/exception_ptr.h b/libcxx/include/__exception/exception_ptr.h index 868fd7c015339c..0a8337fa39de39 100644 --- a/libcxx/include/__exception/exception_ptr.h +++ b/libcxx/include/__exception/exception_ptr.h @@ -9,7 +9,6 @@ #ifndef _LIBCPP___EXCEPTION_EXCEPTION_PTR_H #define _LIBCPP___EXCEPTION_EXCEPTION_PTR_H -#include <__availability> #include <__config> #include <__exception/operations.h> #include <__memory/addressof.h> diff --git a/libcxx/include/__expected/bad_expected_access.h b/libcxx/include/__expected/bad_expected_access.h index ef29fa50883136..1b734389e8311f 100644 --- a/libcxx/include/__expected/bad_expected_access.h +++ b/libcxx/include/__expected/bad_expected_access.h @@ -9,7 +9,6 @@ #ifndef _LIBCPP___EXPECTED_BAD_EXPECTED_ACCESS_H #define _LIBCPP___EXPECTED_BAD_EXPECTED_ACCESS_H -#include <__availability> #include 
<__config> #include <__exception/exception.h> #include <__utility/move.h> diff --git a/libcxx/include/__filesystem/directory_entry.h b/libcxx/include/__filesystem/directory_entry.h index 016ad94a853dc2..96d88dcd90b4c0 100644 --- a/libcxx/include/__filesystem/directory_entry.h +++ b/libcxx/include/__filesystem/directory_entry.h @@ -10,7 +10,6 @@ #ifndef _LIBCPP___FILESYSTEM_DIRECTORY_ENTRY_H #define _LIBCPP___FILESYSTEM_DIRECTORY_ENTRY_H -#include <__availability> #include <__chrono/time_point.h> #include <__compare/ordering.h> #include <__config> diff --git a/libcxx/include/__filesystem/directory_iterator.h b/libcxx/include/__filesystem/directory_iterator.h index a5aa5ff5432dab..e0246d8001e195 100644 --- a/libcxx/include/__filesystem/directory_iterator.h +++ b/libcxx/include/__filesystem/directory_iterator.h @@ -11,7 +11,6 @@ #define _LIBCPP___FILESYSTEM_DIRECTORY_ITERATOR_H #include <__assert> -#include <__availability> #include <__config> #include <__filesystem/directory_entry.h> #include <__filesystem/directory_options.h> diff --git a/libcxx/include/__filesystem/filesystem_error.h b/libcxx/include/__filesystem/filesystem_error.h index bfdcc5eaee521f..80a11e3b1932c7 100644 --- a/libcxx/include/__filesystem/filesystem_error.h +++ b/libcxx/include/__filesystem/filesystem_error.h @@ -10,7 +10,6 @@ #ifndef _LIBCPP___FILESYSTEM_FILESYSTEM_ERROR_H #define _LIBCPP___FILESYSTEM_FILESYSTEM_ERROR_H -#include <__availability> #include <__config> #include <__filesystem/path.h> #include <__memory/shared_ptr.h> diff --git a/libcxx/include/__filesystem/operations.h b/libcxx/include/__filesystem/operations.h index 9bb83576f54bc0..f588189ed1d9de 100644 --- a/libcxx/include/__filesystem/operations.h +++ b/libcxx/include/__filesystem/operations.h @@ -10,7 +10,6 @@ #ifndef _LIBCPP___FILESYSTEM_OPERATIONS_H #define _LIBCPP___FILESYSTEM_OPERATIONS_H -#include <__availability> #include <__chrono/time_point.h> #include <__config> #include <__filesystem/copy_options.h> diff --git 
a/libcxx/include/__filesystem/path.h b/libcxx/include/__filesystem/path.h index 89d319b4b19b57..ff468d517722fe 100644 --- a/libcxx/include/__filesystem/path.h +++ b/libcxx/include/__filesystem/path.h @@ -12,7 +12,6 @@ #include <__algorithm/replace.h> #include <__algorithm/replace_copy.h> -#include <__availability> #include <__config> #include <__functional/unary_function.h> #include <__fwd/functional.h> diff --git a/libcxx/include/__filesystem/path_iterator.h b/libcxx/include/__filesystem/path_iterator.h index d2d65cd122cab8..f4d486d86cf380 100644 --- a/libcxx/include/__filesystem/path_iterator.h +++ b/libcxx/include/__filesystem/path_iterator.h @@ -11,7 +11,6 @@ #define _LIBCPP___FILESYSTEM_PATH_ITERATOR_H #include <__assert> -#include <__availability> #include <__config> #include <__filesystem/path.h> #include <__iterator/iterator_traits.h> diff --git a/libcxx/include/__filesystem/recursive_directory_iterator.h b/libcxx/include/__filesystem/recursive_directory_iterator.h index a8af4f73b14a5f..caa1396eb301fc 100644 --- a/libcxx/include/__filesystem/recursive_directory_iterator.h +++ b/libcxx/include/__filesystem/recursive_directory_iterator.h @@ -10,7 +10,6 @@ #ifndef _LIBCPP___FILESYSTEM_RECURSIVE_DIRECTORY_ITERATOR_H #define _LIBCPP___FILESYSTEM_RECURSIVE_DIRECTORY_ITERATOR_H -#include <__availability> #include <__config> #include <__filesystem/directory_entry.h> #include <__filesystem/directory_options.h> diff --git a/libcxx/include/__filesystem/u8path.h b/libcxx/include/__filesystem/u8path.h index bde878054865e1..dae5823128f028 100644 --- a/libcxx/include/__filesystem/u8path.h +++ b/libcxx/include/__filesystem/u8path.h @@ -11,7 +11,6 @@ #define _LIBCPP___FILESYSTEM_U8PATH_H #include <__algorithm/unwrap_iter.h> -#include <__availability> #include <__config> #include <__filesystem/path.h> #include diff --git a/libcxx/include/__functional/function.h b/libcxx/include/__functional/function.h index 36057706933d43..244e55be3403ca 100644 --- 
a/libcxx/include/__functional/function.h +++ b/libcxx/include/__functional/function.h @@ -11,7 +11,6 @@ #define _LIBCPP___FUNCTIONAL_FUNCTION_H #include <__assert> -#include <__availability> #include <__config> #include <__exception/exception.h> #include <__functional/binary_function.h> diff --git a/libcxx/include/__fwd/memory_resource.h b/libcxx/include/__fwd/memory_resource.h index 03b78ad2bd3c0c..d68b2c2b631543 100644 --- a/libcxx/include/__fwd/memory_resource.h +++ b/libcxx/include/__fwd/memory_resource.h @@ -9,7 +9,6 @@ #ifndef _LIBCPP___FWD_MEMORY_RESOURCE_H #define _LIBCPP___FWD_MEMORY_RESOURCE_H -#include <__availability> #include <__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__fwd/string.h b/libcxx/include/__fwd/string.h index 320c4e4c818361..2418e1f9b23d0d 100644 --- a/libcxx/include/__fwd/string.h +++ b/libcxx/include/__fwd/string.h @@ -9,7 +9,6 @@ #ifndef _LIBCPP___FWD_STRING_H #define _LIBCPP___FWD_STRING_H -#include <__availability> #include <__config> #include <__fwd/memory.h> #include <__fwd/memory_resource.h> diff --git a/libcxx/include/__memory_resource/memory_resource.h b/libcxx/include/__memory_resource/memory_resource.h index e605838bf5ea40..ea85e50cd568bc 100644 --- a/libcxx/include/__memory_resource/memory_resource.h +++ b/libcxx/include/__memory_resource/memory_resource.h @@ -9,7 +9,6 @@ #ifndef _LIBCPP___MEMORY_RESOURCE_MEMORY_RESOURCE_H #define _LIBCPP___MEMORY_RESOURCE_MEMORY_RESOURCE_H -#include <__availability> #include <__config> #include <__fwd/memory_resource.h> #include diff --git a/libcxx/include/__memory_resource/monotonic_buffer_resource.h b/libcxx/include/__memory_resource/monotonic_buffer_resource.h index 0c83f1ebc8db43..f45b30fdb38616 100644 --- a/libcxx/include/__memory_resource/monotonic_buffer_resource.h +++ b/libcxx/include/__memory_resource/monotonic_buffer_resource.h @@ -9,7 +9,6 @@ #ifndef _LIBCPP___MEMORY_RESOURCE_MONOTONIC_BUFFER_RESOURCE_H #define 
_LIBCPP___MEMORY_RESOURCE_MONOTONIC_BUFFER_RESOURCE_H -#include <__availability> #include <__config> #include <__memory/addressof.h> #include <__memory_resource/memory_resource.h> diff --git a/libcxx/include/__memory_resource/polymorphic_allocator.h b/libcxx/include/__memory_resource/polymorphic_allocator.h index 8fda201124387e..a71096d3e47847 100644 --- a/libcxx/include/__memory_resource/polymorphic_allocator.h +++ b/libcxx/include/__memory_resource/polymorphic_allocator.h @@ -10,7 +10,6 @@ #define _LIBCPP___MEMORY_RESOURCE_POLYMORPHIC_ALLOCATOR_H #include <__assert> -#include <__availability> #include <__config> #include <__fwd/pair.h> #include <__memory_resource/memory_resource.h> diff --git a/libcxx/include/__memory_resource/synchronized_pool_resource.h b/libcxx/include/__memory_resource/synchronized_pool_resource.h index b261fb0b194a8e..50a673c2861d10 100644 --- a/libcxx/include/__memory_resource/synchronized_pool_resource.h +++ b/libcxx/include/__memory_resource/synchronized_pool_resource.h @@ -9,7 +9,6 @@ #ifndef _LIBCPP___MEMORY_RESOURCE_SYNCHRONIZED_POOL_RESOURCE_H #define _LIBCPP___MEMORY_RESOURCE_SYNCHRONIZED_POOL_RESOURCE_H -#include <__availability> #include <__config> #include <__memory_resource/memory_resource.h> #include <__memory_resource/pool_options.h> diff --git a/libcxx/include/__memory_resource/unsynchronized_pool_resource.h b/libcxx/include/__memory_resource/unsynchronized_pool_resource.h index 81d5f9ec4da87d..783db84262af72 100644 --- a/libcxx/include/__memory_resource/unsynchronized_pool_resource.h +++ b/libcxx/include/__memory_resource/unsynchronized_pool_resource.h @@ -9,7 +9,6 @@ #ifndef _LIBCPP___MEMORY_RESOURCE_UNSYNCHRONIZED_POOL_RESOURCE_H #define _LIBCPP___MEMORY_RESOURCE_UNSYNCHRONIZED_POOL_RESOURCE_H -#include <__availability> #include <__config> #include <__memory_resource/memory_resource.h> #include <__memory_resource/pool_options.h> diff --git a/libcxx/include/__ostream/print.h b/libcxx/include/__ostream/print.h index 
97680cdab6da3c..8265ac00777e25 100644 --- a/libcxx/include/__ostream/print.h +++ b/libcxx/include/__ostream/print.h @@ -9,7 +9,6 @@ #ifndef _LIBCPP___OSTREAM_PRINT_H #define _LIBCPP___OSTREAM_PRINT_H -#include <__availability> #include <__config> #include <__fwd/ostream.h> #include <__iterator/ostreambuf_iterator.h> diff --git a/libcxx/include/__stop_token/stop_callback.h b/libcxx/include/__stop_token/stop_callback.h index 7b526820f98a37..760cf2bb55b0ce 100644 --- a/libcxx/include/__stop_token/stop_callback.h +++ b/libcxx/include/__stop_token/stop_callback.h @@ -10,7 +10,6 @@ #ifndef _LIBCPP___STOP_TOKEN_STOP_CALLBACK_H #define _LIBCPP___STOP_TOKEN_STOP_CALLBACK_H -#include <__availability> #include <__concepts/constructible.h> #include <__concepts/destructible.h> #include <__concepts/invocable.h> diff --git a/libcxx/include/__stop_token/stop_source.h b/libcxx/include/__stop_token/stop_source.h index 1080069cf3b8be..70697462784ab4 100644 --- a/libcxx/include/__stop_token/stop_source.h +++ b/libcxx/include/__stop_token/stop_source.h @@ -10,7 +10,6 @@ #ifndef _LIBCPP___STOP_TOKEN_STOP_SOURCE_H #define _LIBCPP___STOP_TOKEN_STOP_SOURCE_H -#include <__availability> #include <__config> #include <__stop_token/intrusive_shared_ptr.h> #include <__stop_token/stop_state.h> diff --git a/libcxx/include/__stop_token/stop_state.h b/libcxx/include/__stop_token/stop_state.h index df07573f878628..b0eed13a143cfc 100644 --- a/libcxx/include/__stop_token/stop_state.h +++ b/libcxx/include/__stop_token/stop_state.h @@ -11,7 +11,6 @@ #define _LIBCPP___STOP_TOKEN_STOP_STATE_H #include <__assert> -#include <__availability> #include <__config> #include <__stop_token/atomic_unique_lock.h> #include <__stop_token/intrusive_list_view.h> diff --git a/libcxx/include/__stop_token/stop_token.h b/libcxx/include/__stop_token/stop_token.h index f2eadb990bdeca..1bd75cbbf6f8d8 100644 --- a/libcxx/include/__stop_token/stop_token.h +++ b/libcxx/include/__stop_token/stop_token.h @@ -10,7 +10,6 @@ #ifndef 
_LIBCPP___STOP_TOKEN_STOP_TOKEN_H #define _LIBCPP___STOP_TOKEN_STOP_TOKEN_H -#include <__availability> #include <__config> #include <__stop_token/intrusive_shared_ptr.h> #include <__stop_token/stop_state.h> diff --git a/libcxx/include/__thread/jthread.h b/libcxx/include/__thread/jthread.h index 253e3a935d9b73..b3d5c25fb71c77 100644 --- a/libcxx/include/__thread/jthread.h +++ b/libcxx/include/__thread/jthread.h @@ -10,7 +10,6 @@ #ifndef _LIBCPP___THREAD_JTHREAD_H #define _LIBCPP___THREAD_JTHREAD_H -#include <__availability> #include <__config> #include <__functional/invoke.h> #include <__stop_token/stop_source.h> diff --git a/libcxx/include/__thread/poll_with_backoff.h b/libcxx/include/__thread/poll_with_backoff.h index d8354e6ca23980..4f961fe3f7629f 100644 --- a/libcxx/include/__thread/poll_with_backoff.h +++ b/libcxx/include/__thread/poll_with_backoff.h @@ -10,7 +10,6 @@ #ifndef _LIBCPP___THREAD_POLL_WITH_BACKOFF_H #define _LIBCPP___THREAD_POLL_WITH_BACKOFF_H -#include <__availability> #include <__chrono/duration.h> #include <__chrono/high_resolution_clock.h> #include <__config> diff --git a/libcxx/include/__verbose_abort b/libcxx/include/__verbose_abort index 259c70dda8fe83..1e2265a6bf7558 100644 --- a/libcxx/include/__verbose_abort +++ b/libcxx/include/__verbose_abort @@ -10,7 +10,6 @@ #ifndef _LIBCPP___VERBOSE_ABORT #define _LIBCPP___VERBOSE_ABORT -#include <__availability> #include <__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/barrier b/libcxx/include/barrier index a6b4d2288309e3..bce67bb5d34250 100644 --- a/libcxx/include/barrier +++ b/libcxx/include/barrier @@ -54,7 +54,6 @@ namespace std #include <__assert> #include <__atomic/atomic_base.h> #include <__atomic/memory_order.h> -#include <__availability> #include <__memory/unique_ptr.h> #include <__thread/poll_with_backoff.h> #include <__thread/timed_backoff_policy.h> diff --git a/libcxx/include/condition_variable b/libcxx/include/condition_variable index 
4ded1140d46b1b..5195cd6057dd33 100644 --- a/libcxx/include/condition_variable +++ b/libcxx/include/condition_variable @@ -118,7 +118,6 @@ public: */ -#include <__availability> #include <__chrono/duration.h> #include <__chrono/steady_clock.h> #include <__chrono/time_point.h> diff --git a/libcxx/include/deque b/libcxx/include/deque index 3c33e04e9f05f8..555761aae6afd2 100644 --- a/libcxx/include/deque +++ b/libcxx/include/deque @@ -189,7 +189,6 @@ template #include <__algorithm/remove_if.h> #include <__algorithm/unwrap_iter.h> #include <__assert> -#include <__availability> #include <__config> #include <__debug_utils/sanitizers.h> #include <__format/enable_insertable.h> diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list index 80dd49fe3d75a0..363931e3f23881 100644 --- a/libcxx/include/forward_list +++ b/libcxx/include/forward_list @@ -199,7 +199,6 @@ template #include <__algorithm/lexicographical_compare.h> #include <__algorithm/lexicographical_compare_three_way.h> #include <__algorithm/min.h> -#include <__availability> #include <__config> #include <__iterator/distance.h> #include <__iterator/iterator_traits.h> diff --git a/libcxx/include/fstream b/libcxx/include/fstream index 7128f72e161193..18f4dd3eed0b23 100644 --- a/libcxx/include/fstream +++ b/libcxx/include/fstream @@ -188,7 +188,6 @@ typedef basic_fstream wfstream; #include <__algorithm/max.h> #include <__assert> -#include <__availability> #include <__config> #include <__fwd/fstream.h> #include <__locale> diff --git a/libcxx/include/latch b/libcxx/include/latch index 1937617f7dcc61..da8dae149c79f3 100644 --- a/libcxx/include/latch +++ b/libcxx/include/latch @@ -50,7 +50,6 @@ namespace std #include <__atomic/atomic_base.h> #include <__atomic/atomic_sync.h> #include <__atomic/memory_order.h> -#include <__availability> #include #include #include diff --git a/libcxx/include/list b/libcxx/include/list index 610a24e384600e..87f15e144ac8f2 100644 --- a/libcxx/include/list +++ b/libcxx/include/list 
@@ -203,7 +203,6 @@ template #include <__algorithm/lexicographical_compare_three_way.h> #include <__algorithm/min.h> #include <__assert> -#include <__availability> #include <__config> #include <__format/enable_insertable.h> #include <__iterator/distance.h> diff --git a/libcxx/include/map b/libcxx/include/map index 1d1c062a0267c0..7efa715e84aa7e 100644 --- a/libcxx/include/map +++ b/libcxx/include/map @@ -575,7 +575,6 @@ erase_if(multimap& c, Predicate pred); // C++20 #include <__algorithm/lexicographical_compare.h> #include <__algorithm/lexicographical_compare_three_way.h> #include <__assert> -#include <__availability> #include <__config> #include <__functional/binary_function.h> #include <__functional/is_transparent.h> diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index 1f7c2a183f63d0..48391b2a12095d 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -570,10 +570,6 @@ module std_private_assert [system] { header "__assert" export * } -module std_private_availability [system] { - header "__availability" - export * -} module std_private_bit_reference [system] { header "__bit_reference" export * @@ -584,7 +580,9 @@ module std_private_fwd_bit_reference [system] { module std_private_config [system] { textual header "__config" textual header "__configuration/abi.h" + textual header "__configuration/availability.h" textual header "__configuration/compiler.h" + textual header "__configuration/language.h" textual header "__configuration/platform.h" export * } diff --git a/libcxx/include/optional b/libcxx/include/optional index a16e48502e2509..622e150f7a9f7c 100644 --- a/libcxx/include/optional +++ b/libcxx/include/optional @@ -178,7 +178,6 @@ namespace std { */ #include <__assert> -#include <__availability> #include <__compare/compare_three_way_result.h> #include <__compare/three_way_comparable.h> #include <__concepts/invocable.h> diff --git a/libcxx/include/print b/libcxx/include/print index 
e0bcf214ea239b..5bdaa559af7242 100644 --- a/libcxx/include/print +++ b/libcxx/include/print @@ -34,7 +34,6 @@ namespace std { */ #include <__assert> -#include <__availability> #include <__concepts/same_as.h> #include <__config> #include <__system_error/system_error.h> diff --git a/libcxx/include/regex b/libcxx/include/regex index ce9f34260254a0..b3869d36de1dfb 100644 --- a/libcxx/include/regex +++ b/libcxx/include/regex @@ -792,7 +792,6 @@ typedef regex_token_iterator wsregex_token_iterator; #include <__algorithm/find.h> #include <__algorithm/search.h> #include <__assert> -#include <__availability> #include <__config> #include <__iterator/back_insert_iterator.h> #include <__iterator/default_sentinel.h> diff --git a/libcxx/include/semaphore b/libcxx/include/semaphore index cb2f42c106ca85..8d3b04475c092d 100644 --- a/libcxx/include/semaphore +++ b/libcxx/include/semaphore @@ -55,7 +55,6 @@ using binary_semaphore = counting_semaphore<1>; #include <__atomic/atomic_base.h> #include <__atomic/atomic_sync.h> #include <__atomic/memory_order.h> -#include <__availability> #include <__chrono/time_point.h> #include <__thread/poll_with_backoff.h> #include <__thread/support.h> diff --git a/libcxx/include/set b/libcxx/include/set index d9377ee6c33224..ab3a4363499af9 100644 --- a/libcxx/include/set +++ b/libcxx/include/set @@ -516,7 +516,6 @@ erase_if(multiset& c, Predicate pred); // C++20 #include <__algorithm/lexicographical_compare.h> #include <__algorithm/lexicographical_compare_three_way.h> #include <__assert> -#include <__availability> #include <__config> #include <__functional/is_transparent.h> #include <__functional/operations.h> diff --git a/libcxx/include/sstream b/libcxx/include/sstream index 5009fe5c0057be..9ba43ffeb850f2 100644 --- a/libcxx/include/sstream +++ b/libcxx/include/sstream @@ -312,7 +312,6 @@ typedef basic_stringstream wstringstream; // clang-format on -#include <__availability> #include <__config> #include <__fwd/sstream.h> #include 
<__ostream/basic_ostream.h> diff --git a/libcxx/include/unordered_map b/libcxx/include/unordered_map index c838cd96b1123e..2e25b0f0506956 100644 --- a/libcxx/include/unordered_map +++ b/libcxx/include/unordered_map @@ -585,7 +585,6 @@ template #include <__algorithm/is_permutation.h> #include <__assert> -#include <__availability> #include <__config> #include <__functional/is_transparent.h> #include <__functional/operations.h> diff --git a/libcxx/include/unordered_set b/libcxx/include/unordered_set index 5de1458beb1e6a..c966cc8eb4df1b 100644 --- a/libcxx/include/unordered_set +++ b/libcxx/include/unordered_set @@ -533,7 +533,6 @@ template #include <__algorithm/is_permutation.h> #include <__assert> -#include <__availability> #include <__config> #include <__functional/is_transparent.h> #include <__functional/operations.h> diff --git a/libcxx/include/variant b/libcxx/include/variant index 631ffceab5f68f..7ebd0534b16414 100644 --- a/libcxx/include/variant +++ b/libcxx/include/variant @@ -212,7 +212,6 @@ namespace std { */ -#include <__availability> #include <__compare/common_comparison_category.h> #include <__compare/compare_three_way_result.h> #include <__compare/three_way_comparable.h> diff --git a/libcxx/include/vector b/libcxx/include/vector index b190557fb7b7e8..cbfc2cefa1fd93 100644 --- a/libcxx/include/vector +++ b/libcxx/include/vector @@ -316,7 +316,6 @@ template requires is-vector-bool-reference // Since C++ #include <__algorithm/rotate.h> #include <__algorithm/unwrap_iter.h> #include <__assert> -#include <__availability> #include <__bit_reference> #include <__concepts/same_as.h> #include <__config> diff --git a/libcxx/include/version b/libcxx/include/version index 140a9a0d870360..d433e1b1c9cea0 100644 --- a/libcxx/include/version +++ b/libcxx/include/version @@ -255,7 +255,6 @@ __cpp_lib_void_t 201411L */ -#include <__availability> #include <__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/src/optional.cpp 
b/libcxx/src/optional.cpp index 6ba63f2d89f5a5..62b474a312be2d 100644 --- a/libcxx/src/optional.cpp +++ b/libcxx/src/optional.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// -#include <__availability> #include #include diff --git a/libcxx/src/ostream.cpp b/libcxx/src/ostream.cpp index 443dce9a390bee..e1a9a4bc1de718 100644 --- a/libcxx/src/ostream.cpp +++ b/libcxx/src/ostream.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// -#include <__availability> #include <__config> #ifndef _LIBCPP_HAS_NO_FILESYSTEM # include diff --git a/libcxx/test/libcxx/vendor/apple/availability-with-pedantic-errors.compile.pass.cpp b/libcxx/test/libcxx/vendor/apple/availability-with-pedantic-errors.compile.pass.cpp index c55a0a4d6e5d1b..60723bf7b6e971 100644 --- a/libcxx/test/libcxx/vendor/apple/availability-with-pedantic-errors.compile.pass.cpp +++ b/libcxx/test/libcxx/vendor/apple/availability-with-pedantic-errors.compile.pass.cpp @@ -15,7 +15,7 @@ // ADDITIONAL_COMPILE_FLAGS: -pedantic-errors -#include <__availability> +#include <__config> #if defined(_LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS) # error Availability annotations should be enabled on Apple platforms in the system configuration! diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py index 1e79f6c140758c..490ecefc975222 100755 --- a/libcxx/utils/generate_feature_test_macro_components.py +++ b/libcxx/utils/generate_feature_test_macro_components.py @@ -61,7 +61,8 @@ def add_version_header(tc): # just libc++. It may depend on # * macros defined by the compiler itself, or # * macros generated by CMake. -# In some cases we add also depend on macros defined in <__availability>. +# In some cases we add also depend on macros defined in +# <__configuration/availability.h>. # libcxx_guard An optional string field. 
When this field is provided, # `test_suite_guard` must also be provided. This field is used # only to guard the feature-test macro in . It may @@ -1562,7 +1563,6 @@ def produce_version_header(): */ -#include <__availability> #include <__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) From 633ea41b54bf7b2f10850bbd5ba3c4ab06081595 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Tue, 28 May 2024 18:29:47 -0700 Subject: [PATCH 052/230] [runtimes] Reintroduce a way to select the compiler used for the test suite (#93542) A while back, the cxx_under_test Lit parameter was removed. This patch reintroduces a Lit parameter called "compiler" which controls the value of the %{cxx} substitution used in the test suite. To run the test suite with a different compiler, one can now pass --param compiler=<path>. --- libcxx/test/CMakeLists.txt | 2 ++ libcxx/test/configs/cmake-bridge.cfg.in | 2 -- libcxx/utils/libcxx/test/params.py | 8 ++++++++ libcxxabi/test/CMakeLists.txt | 2 ++ libcxxabi/test/configs/cmake-bridge.cfg.in | 1 - libunwind/test/CMakeLists.txt | 2 ++ libunwind/test/configs/cmake-bridge.cfg.in | 2 -- 7 files changed, 14 insertions(+), 5 deletions(-) diff --git a/libcxx/test/CMakeLists.txt b/libcxx/test/CMakeLists.txt index fd57aa9fe8b375..ee3502d32f7ae5 100644 --- a/libcxx/test/CMakeLists.txt +++ b/libcxx/test/CMakeLists.txt @@ -16,6 +16,8 @@ endif() set(AUTO_GEN_COMMENT "## Autogenerated by libcxx configuration.\n# Do not edit!") set(SERIALIZED_LIT_PARAMS "# Lit parameters serialized here for llvm-lit to pick them up\n") +serialize_lit_string_param(SERIALIZED_LIT_PARAMS compiler "${CMAKE_CXX_COMPILER}") + if (NOT LIBCXX_ENABLE_EXCEPTIONS) serialize_lit_param(SERIALIZED_LIT_PARAMS enable_exceptions False) endif() diff --git a/libcxx/test/configs/cmake-bridge.cfg.in b/libcxx/test/configs/cmake-bridge.cfg.in index 84b3270a8940ac..78d0cb5a257488 100644 --- a/libcxx/test/configs/cmake-bridge.cfg.in +++ b/libcxx/test/configs/cmake-bridge.cfg.in @@ -23,8 +23,6 @@
config.recursiveExpansionLimit = 10 config.test_exec_root = os.path.join('@CMAKE_BINARY_DIR@', 'test') # Add substitutions for bootstrapping the test suite configuration -import shlex -config.substitutions.append(('%{cxx}', shlex.quote('@CMAKE_CXX_COMPILER@'))) config.substitutions.append(('%{libcxx-dir}', '@LIBCXX_SOURCE_DIR@')) config.substitutions.append(('%{include-dir}', '@LIBCXX_GENERATED_INCLUDE_DIR@')) config.substitutions.append(('%{target-include-dir}', '@LIBCXX_GENERATED_INCLUDE_TARGET_DIR@')) diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py index c2d294e49f4884..4c8590a2135d9e 100644 --- a/libcxx/utils/libcxx/test/params.py +++ b/libcxx/utils/libcxx/test/params.py @@ -143,6 +143,14 @@ def getSuitableClangTidy(cfg): # fmt: off DEFAULT_PARAMETERS = [ + Parameter( + name="compiler", + type=str, + help="The path of the compiler to use for testing.", + actions=lambda cxx: [ + AddSubstitution("%{cxx}", shlex.quote(cxx)), + ], + ), Parameter( name="target_triple", type=str, diff --git a/libcxxabi/test/CMakeLists.txt b/libcxxabi/test/CMakeLists.txt index 586927189cf1dd..cd908a3514cb27 100644 --- a/libcxxabi/test/CMakeLists.txt +++ b/libcxxabi/test/CMakeLists.txt @@ -24,6 +24,8 @@ endif() set(AUTO_GEN_COMMENT "## Autogenerated by libcxxabi configuration.\n# Do not edit!") set(SERIALIZED_LIT_PARAMS "# Lit parameters serialized here for llvm-lit to pick them up\n") +serialize_lit_string_param(SERIALIZED_LIT_PARAMS compiler "${CMAKE_CXX_COMPILER}") + if (NOT LIBCXXABI_ENABLE_EXCEPTIONS) serialize_lit_param(SERIALIZED_LIT_PARAMS enable_exceptions False) endif() diff --git a/libcxxabi/test/configs/cmake-bridge.cfg.in b/libcxxabi/test/configs/cmake-bridge.cfg.in index 1d0f51d37437bd..3fefc6a7fdc88a 100644 --- a/libcxxabi/test/configs/cmake-bridge.cfg.in +++ b/libcxxabi/test/configs/cmake-bridge.cfg.in @@ -26,7 +26,6 @@ config.test_exec_root = os.path.join('@CMAKE_BINARY_DIR@', 'test') # TODO: This is a non-standard Lit attribute and 
we should have another way of accessing this. config.host_triple = '@LLVM_HOST_TRIPLE@' -config.substitutions.append(('%{cxx}', '@CMAKE_CXX_COMPILER@')) config.substitutions.append(('%{libcxx}', '@LIBCXXABI_LIBCXX_PATH@')) config.substitutions.append(('%{include}', '@LIBCXXABI_SOURCE_DIR@/include')) config.substitutions.append(('%{cxx-include}', '@LIBCXXABI_HEADER_DIR@/include/c++/v1')) diff --git a/libunwind/test/CMakeLists.txt b/libunwind/test/CMakeLists.txt index 21dfbb0a84f0a8..bd2e575f2a296a 100644 --- a/libunwind/test/CMakeLists.txt +++ b/libunwind/test/CMakeLists.txt @@ -15,6 +15,8 @@ pythonize_bool(LIBUNWIND_USES_ARM_EHABI) set(AUTO_GEN_COMMENT "## Autogenerated by libunwind configuration.\n# Do not edit!") set(SERIALIZED_LIT_PARAMS "# Lit parameters serialized here for llvm-lit to pick them up\n") +serialize_lit_string_param(SERIALIZED_LIT_PARAMS compiler "${CMAKE_CXX_COMPILER}") + if (LIBUNWIND_EXECUTOR) message(DEPRECATION "LIBUNWIND_EXECUTOR is deprecated, please add executor=... 
to LIBUNWIND_TEST_PARAMS") serialize_lit_string_param(SERIALIZED_LIT_PARAMS executor "${LIBUNWIND_EXECUTOR}") diff --git a/libunwind/test/configs/cmake-bridge.cfg.in b/libunwind/test/configs/cmake-bridge.cfg.in index c5f34c87abb92a..7fc7a3da424629 100644 --- a/libunwind/test/configs/cmake-bridge.cfg.in +++ b/libunwind/test/configs/cmake-bridge.cfg.in @@ -29,7 +29,5 @@ if not @LIBUNWIND_ENABLE_THREADS@: config.available_features.add('libunwind-no-threads') # Add substitutions for bootstrapping the test suite configuration -import shlex -config.substitutions.append(('%{cxx}', shlex.quote('@CMAKE_CXX_COMPILER@'))) config.substitutions.append(('%{include}', '@LIBUNWIND_SOURCE_DIR@/include')) config.substitutions.append(('%{lib}', '@LIBUNWIND_LIBRARY_DIR@')) From bd135c3b9fb57e6346e4a790945809617388ca9b Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Tue, 28 May 2024 18:31:01 -0700 Subject: [PATCH 053/230] [runtimes][CMake] Simplify the propagation of test dependencies (#93558) Instead of using FOO_TEST_DEPS global variables that don't get updated properly from subdirectories, use targets to propagate the dependencies across directories. 
--- libcxx/CMakeLists.txt | 7 +++---- libcxx/benchmarks/CMakeLists.txt | 6 +----- libcxx/modules/CMakeLists.txt | 1 + libcxx/src/CMakeLists.txt | 2 ++ libcxx/test/CMakeLists.txt | 14 -------------- libcxx/test/tools/clang_tidy_checks/CMakeLists.txt | 2 ++ libcxxabi/CMakeLists.txt | 3 +++ libcxxabi/src/CMakeLists.txt | 1 + libcxxabi/test/CMakeLists.txt | 13 +------------ libunwind/test/CMakeLists.txt | 2 +- 10 files changed, 15 insertions(+), 36 deletions(-) diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt index bbde9abc57919e..a061fda88b5c62 100644 --- a/libcxx/CMakeLists.txt +++ b/libcxx/CMakeLists.txt @@ -856,15 +856,14 @@ endfunction() #=============================================================================== # Setup Source Code And Tests #=============================================================================== +add_custom_target(cxx-test-depends + COMMENT "Build dependencies required to run the libc++ test suite.") + add_subdirectory(include) add_subdirectory(src) add_subdirectory(utils) add_subdirectory(modules) -set(LIBCXX_TEST_DEPS "cxx_experimental") - -list(APPEND LIBCXX_TEST_DEPS generate-cxx-modules) - if (LIBCXX_INCLUDE_BENCHMARKS) add_subdirectory(benchmarks) endif() diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt index 93b549a316e385..2101f9c71788c1 100644 --- a/libcxx/benchmarks/CMakeLists.txt +++ b/libcxx/benchmarks/CMakeLists.txt @@ -252,10 +252,6 @@ endforeach() if (LIBCXX_INCLUDE_TESTS) include(AddLLVM) - if (NOT DEFINED LIBCXX_TEST_DEPS) - message(FATAL_ERROR "Expected LIBCXX_TEST_DEPS to be defined") - endif() - configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py) @@ -265,6 +261,6 @@ if (LIBCXX_INCLUDE_TESTS) add_lit_target(check-cxx-benchmarks "Running libcxx benchmarks tests" ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS cxx-benchmarks ${LIBCXX_TEST_DEPS} + DEPENDS cxx-benchmarks cxx-test-depends ARGS ${BENCHMARK_LIT_ARGS}) endif() 
diff --git a/libcxx/modules/CMakeLists.txt b/libcxx/modules/CMakeLists.txt index d47d19a4755317..82cd7b66beb7a9 100644 --- a/libcxx/modules/CMakeLists.txt +++ b/libcxx/modules/CMakeLists.txt @@ -202,6 +202,7 @@ add_custom_target(generate-cxx-modules ALL DEPENDS ${_all_modules} ) +add_dependencies(cxx-test-depends generate-cxx-modules) # Configure the modules manifest. # Use the relative path between the installation and the module in the json diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt index 8b28d1b8918955..65e6ce2c4da43a 100644 --- a/libcxx/src/CMakeLists.txt +++ b/libcxx/src/CMakeLists.txt @@ -322,6 +322,7 @@ endif() # Add a meta-target for both libraries. add_custom_target(cxx DEPENDS ${LIBCXX_BUILD_TARGETS}) +add_dependencies(cxx-test-depends cxx) set(LIBCXX_EXPERIMENTAL_SOURCES experimental/keep.cpp @@ -366,6 +367,7 @@ set_target_properties(cxx_experimental ) cxx_add_common_build_flags(cxx_experimental) target_compile_options(cxx_experimental PUBLIC -D_LIBCPP_ENABLE_EXPERIMENTAL) +add_dependencies(cxx-test-depends cxx_experimental) if (LIBCXX_INSTALL_SHARED_LIBRARY) install(TARGETS cxx_shared diff --git a/libcxx/test/CMakeLists.txt b/libcxx/test/CMakeLists.txt index ee3502d32f7ae5..3c54a4edccff38 100644 --- a/libcxx/test/CMakeLists.txt +++ b/libcxx/test/CMakeLists.txt @@ -1,11 +1,5 @@ include(HandleLitArguments) add_subdirectory(tools) -# When the tools add clang-tidy support, the dependencies need to be updated. -# This cannot be done in the tools CMakeLists.txt since that does not update -# the status in this (a parent) directory. -if(TARGET cxx-tidy) - list(APPEND LIBCXX_TEST_DEPS cxx-tidy) -endif() # By default, libcxx and libcxxabi share a library directory. 
if (NOT LIBCXX_CXX_ABI_LIBRARY_PATH) @@ -40,10 +34,6 @@ endif() serialize_lit_params_list(SERIALIZED_LIT_PARAMS LIBCXX_TEST_PARAMS) -if (NOT DEFINED LIBCXX_TEST_DEPS) - message(FATAL_ERROR "Expected LIBCXX_TEST_DEPS to be defined") -endif() - if (MSVC) # Shared code for initializing some parameters used by all # llvm-libc++-*-clangcl.cfg.in test configs. @@ -81,10 +71,6 @@ if (LIBCXX_INCLUDE_TESTS) ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg MAIN_CONFIG "${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py") - add_custom_target(cxx-test-depends - DEPENDS cxx ${LIBCXX_TEST_DEPS} - COMMENT "Builds dependencies required to run the test suite.") - add_lit_testsuite(check-cxx "Running libcxx tests" ${CMAKE_CURRENT_BINARY_DIR} diff --git a/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt b/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt index 28c1dbf8aca3c1..f0289dc44c6625 100644 --- a/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt +++ b/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt @@ -110,3 +110,5 @@ set_target_properties(cxx-tidy PROPERTIES set_target_properties(cxx-tidy PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) set(CMAKE_SHARED_MODULE_SUFFIX_CXX .plugin) # Use a portable suffix to simplify how we can find it from Lit + +add_dependencies(cxx-test-depends cxx-tidy) diff --git a/libcxxabi/CMakeLists.txt b/libcxxabi/CMakeLists.txt index f7673da25d20e0..86fe4a604f30d9 100644 --- a/libcxxabi/CMakeLists.txt +++ b/libcxxabi/CMakeLists.txt @@ -443,6 +443,9 @@ if (NOT "${LIBCXXABI_LIBUNWIND_INCLUDES_INTERNAL}" STREQUAL "") include_directories("${LIBCXXABI_LIBUNWIND_INCLUDES_INTERNAL}") endif() +add_custom_target(cxxabi-test-depends + COMMENT "Build dependencies required to run the libc++abi test suite.") + # Add source code. This also contains all of the logic for deciding linker flags # soname, etc... 
add_subdirectory(include) diff --git a/libcxxabi/src/CMakeLists.txt b/libcxxabi/src/CMakeLists.txt index c8cc93de50777b..c54ced4dc3ea86 100644 --- a/libcxxabi/src/CMakeLists.txt +++ b/libcxxabi/src/CMakeLists.txt @@ -304,6 +304,7 @@ endif() # Add a meta-target for both libraries. add_custom_target(cxxabi DEPENDS ${LIBCXXABI_BUILD_TARGETS}) +add_dependencies(cxxabi-test-depends cxxabi cxx) if (LIBCXXABI_INSTALL_LIBRARY) install(TARGETS ${LIBCXXABI_INSTALL_TARGETS} diff --git a/libcxxabi/test/CMakeLists.txt b/libcxxabi/test/CMakeLists.txt index cd908a3514cb27..8e3048f2ffe8a1 100644 --- a/libcxxabi/test/CMakeLists.txt +++ b/libcxxabi/test/CMakeLists.txt @@ -10,17 +10,6 @@ endmacro() pythonize_bool(LIBCXXABI_USE_LLVM_UNWINDER) -if (LIBCXXABI_ENABLE_SHARED) - set(LIBCXXABI_TEST_DEPS cxxabi_shared) -else() - set(LIBCXXABI_TEST_DEPS cxxabi_static) -endif() - -list(APPEND LIBCXXABI_TEST_DEPS cxx) -if (LIBCXXABI_USE_LLVM_UNWINDER AND TARGET unwind) - list(APPEND LIBCXXABI_TEST_DEPS unwind) -endif() - set(AUTO_GEN_COMMENT "## Autogenerated by libcxxabi configuration.\n# Do not edit!") set(SERIALIZED_LIT_PARAMS "# Lit parameters serialized here for llvm-lit to pick them up\n") @@ -59,4 +48,4 @@ configure_lit_site_cfg( add_lit_testsuite(check-cxxabi "Running libcxxabi tests" ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS ${LIBCXXABI_TEST_DEPS}) + DEPENDS cxxabi-test-depends) diff --git a/libunwind/test/CMakeLists.txt b/libunwind/test/CMakeLists.txt index bd2e575f2a296a..19f055f6f93ffc 100644 --- a/libunwind/test/CMakeLists.txt +++ b/libunwind/test/CMakeLists.txt @@ -47,4 +47,4 @@ configure_lit_site_cfg( add_lit_testsuite(check-unwind "Running libunwind tests" ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS unwind ${LIBUNWIND_TEST_DEPS}) + DEPENDS unwind) From 7832769d329ead264aff238c06dce086b3a74922 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Tue, 28 May 2024 19:46:23 -0600 Subject: [PATCH 054/230] Revert "[lld] Support thumb PLTs" (#93631) Reverts llvm/llvm-project#86223 windows pre-merge 
is broken. --- lld/ELF/Arch/ARM.cpp | 176 ++++++++------------------- lld/ELF/Config.h | 1 - lld/ELF/InputFiles.cpp | 12 -- lld/test/ELF/armv8-thumb-plt-reloc.s | 126 ------------------- 4 files changed, 53 insertions(+), 262 deletions(-) delete mode 100644 lld/test/ELF/armv8-thumb-plt-reloc.s diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp index 3e0efe540e1bf1..687f9499009d5e 100644 --- a/lld/ELF/Arch/ARM.cpp +++ b/lld/ELF/Arch/ARM.cpp @@ -231,71 +231,36 @@ static void writePltHeaderLong(uint8_t *buf) { // The default PLT header requires the .got.plt to be within 128 Mb of the // .plt in the positive direction. void ARM::writePltHeader(uint8_t *buf) const { - if (config->armThumbPLTs) { - // The instruction sequence for thumb: - // - // 0: b500 push {lr} - // 2: f8df e008 ldr.w lr, [pc, #0x8] @ 0xe - // 6: 44fe add lr, pc - // 8: f85e ff08 ldr pc, [lr, #8]! - // e: .word .got.plt - .plt - 16 - // - // At 0x8, we want to jump to .got.plt, the -16 accounts for 8 bytes from - // `pc` in the add instruction and 8 bytes for the `lr` adjustment. - // - uint64_t offset = in.gotPlt->getVA() - in.plt->getVA() - 16; - assert(llvm::isUInt<32>(offset) && "This should always fit into a 32-bit offset"); - write16(buf + 0, 0xb500); - // Split into two halves to support endianness correctly. - write16(buf + 2, 0xf8df); - write16(buf + 4, 0xe008); - write16(buf + 6, 0x44fe); - // Split into two halves to support endianness correctly. - write16(buf + 8, 0xf85e); - write16(buf + 10, 0xff08); - write32(buf + 12, offset); - - memcpy(buf + 16, trapInstr.data(), 4); // Pad to 32-byte boundary - memcpy(buf + 20, trapInstr.data(), 4); - memcpy(buf + 24, trapInstr.data(), 4); - memcpy(buf + 28, trapInstr.data(), 4); - } else { - // Use a similar sequence to that in writePlt(), the difference is the - // calling conventions mean we use lr instead of ip. The PLT entry is - // responsible for saving lr on the stack, the dynamic loader is responsible - // for reloading it. 
- const uint32_t pltData[] = { - 0xe52de004, // L1: str lr, [sp,#-4]! - 0xe28fe600, // add lr, pc, #0x0NN00000 &(.got.plt - L1 - 4) - 0xe28eea00, // add lr, lr, #0x000NN000 &(.got.plt - L1 - 4) - 0xe5bef000, // ldr pc, [lr, #0x00000NNN] &(.got.plt -L1 - 4) - }; - - uint64_t offset = in.gotPlt->getVA() - in.plt->getVA() - 4; - if (!llvm::isUInt<27>(offset)) { - // We cannot encode the Offset, use the long form. - writePltHeaderLong(buf); - return; - } - write32(buf + 0, pltData[0]); - write32(buf + 4, pltData[1] | ((offset >> 20) & 0xff)); - write32(buf + 8, pltData[2] | ((offset >> 12) & 0xff)); - write32(buf + 12, pltData[3] | (offset & 0xfff)); - memcpy(buf + 16, trapInstr.data(), 4); // Pad to 32-byte boundary - memcpy(buf + 20, trapInstr.data(), 4); - memcpy(buf + 24, trapInstr.data(), 4); - memcpy(buf + 28, trapInstr.data(), 4); + // Use a similar sequence to that in writePlt(), the difference is the calling + // conventions mean we use lr instead of ip. The PLT entry is responsible for + // saving lr on the stack, the dynamic loader is responsible for reloading + // it. + const uint32_t pltData[] = { + 0xe52de004, // L1: str lr, [sp,#-4]! + 0xe28fe600, // add lr, pc, #0x0NN00000 &(.got.plt - L1 - 4) + 0xe28eea00, // add lr, lr, #0x000NN000 &(.got.plt - L1 - 4) + 0xe5bef000, // ldr pc, [lr, #0x00000NNN] &(.got.plt -L1 - 4) + }; + + uint64_t offset = in.gotPlt->getVA() - in.plt->getVA() - 4; + if (!llvm::isUInt<27>(offset)) { + // We cannot encode the Offset, use the long form. 
+ writePltHeaderLong(buf); + return; } + write32(buf + 0, pltData[0]); + write32(buf + 4, pltData[1] | ((offset >> 20) & 0xff)); + write32(buf + 8, pltData[2] | ((offset >> 12) & 0xff)); + write32(buf + 12, pltData[3] | (offset & 0xfff)); + memcpy(buf + 16, trapInstr.data(), 4); // Pad to 32-byte boundary + memcpy(buf + 20, trapInstr.data(), 4); + memcpy(buf + 24, trapInstr.data(), 4); + memcpy(buf + 28, trapInstr.data(), 4); } void ARM::addPltHeaderSymbols(InputSection &isec) const { - if (config->armThumbPLTs) { - addSyntheticLocal("$t", STT_NOTYPE, 0, 0, isec); - addSyntheticLocal("$d", STT_NOTYPE, 12, 0, isec); - } else { - addSyntheticLocal("$a", STT_NOTYPE, 0, 0, isec); - addSyntheticLocal("$d", STT_NOTYPE, 16, 0, isec); - } + addSyntheticLocal("$a", STT_NOTYPE, 0, 0, isec); + addSyntheticLocal("$d", STT_NOTYPE, 16, 0, isec); } // Long form PLT entries that do not have any restrictions on the displacement @@ -314,65 +279,32 @@ static void writePltLong(uint8_t *buf, uint64_t gotPltEntryAddr, // .plt in the positive direction. void ARM::writePlt(uint8_t *buf, const Symbol &sym, uint64_t pltEntryAddr) const { + // The PLT entry is similar to the example given in Appendix A of ELF for + // the Arm Architecture. Instead of using the Group Relocations to find the + // optimal rotation for the 8-bit immediate used in the add instructions we + // hard code the most compact rotations for simplicity. This saves a load + // instruction over the long plt sequences. + const uint32_t pltData[] = { + 0xe28fc600, // L1: add ip, pc, #0x0NN00000 Offset(&(.got.plt) - L1 - 8 + 0xe28cca00, // add ip, ip, #0x000NN000 Offset(&(.got.plt) - L1 - 8 + 0xe5bcf000, // ldr pc, [ip, #0x00000NNN] Offset(&(.got.plt) - L1 - 8 + }; - if (!config->armThumbPLTs) { - uint64_t offset = sym.getGotPltVA() - pltEntryAddr - 8; - - // The PLT entry is similar to the example given in Appendix A of ELF for - // the Arm Architecture. 
Instead of using the Group Relocations to find the - // optimal rotation for the 8-bit immediate used in the add instructions we - // hard code the most compact rotations for simplicity. This saves a load - // instruction over the long plt sequences. - const uint32_t pltData[] = { - 0xe28fc600, // L1: add ip, pc, #0x0NN00000 Offset(&(.got.plt) - L1 - 8 - 0xe28cca00, // add ip, ip, #0x000NN000 Offset(&(.got.plt) - L1 - 8 - 0xe5bcf000, // ldr pc, [ip, #0x00000NNN] Offset(&(.got.plt) - L1 - 8 - }; - if (!llvm::isUInt<27>(offset)) { - // We cannot encode the Offset, use the long form. - writePltLong(buf, sym.getGotPltVA(), pltEntryAddr); - return; - } - write32(buf + 0, pltData[0] | ((offset >> 20) & 0xff)); - write32(buf + 4, pltData[1] | ((offset >> 12) & 0xff)); - write32(buf + 8, pltData[2] | (offset & 0xfff)); - memcpy(buf + 12, trapInstr.data(), 4); // Pad to 16-byte boundary - } else { - uint64_t offset = sym.getGotPltVA() - pltEntryAddr - 12; - assert(llvm::isUInt<32>(offset) && "This should always fit into a 32-bit offset"); - - // A PLT entry will be: - // - // movw ip, # - // movt ip, # - // add ip, pc - // L1: ldr.w pc, [ip] - // b L1 - // - // where ip = r12 = 0xc - - // movw ip, # - write16(buf + 2, 0x0c00); // use `ip` - relocateNoSym(buf, R_ARM_THM_MOVW_ABS_NC, offset); - - // movt ip, # - write16(buf + 6, 0x0c00); // use `ip` - relocateNoSym(buf + 4, R_ARM_THM_MOVT_ABS, offset); - - write16(buf + 8, 0x44fc); // add ip, pc - write16(buf + 10, 0xf8dc); // ldr.w pc, [ip] (bottom half) - write16(buf + 12, 0xf000); // ldr.w pc, [ip] (upper half) - write16(buf + 14, 0xe7fc); // Branch to previous instruction + uint64_t offset = sym.getGotPltVA() - pltEntryAddr - 8; + if (!llvm::isUInt<27>(offset)) { + // We cannot encode the Offset, use the long form. 
+ writePltLong(buf, sym.getGotPltVA(), pltEntryAddr); + return; } + write32(buf + 0, pltData[0] | ((offset >> 20) & 0xff)); + write32(buf + 4, pltData[1] | ((offset >> 12) & 0xff)); + write32(buf + 8, pltData[2] | (offset & 0xfff)); + memcpy(buf + 12, trapInstr.data(), 4); // Pad to 16-byte boundary } void ARM::addPltSymbols(InputSection &isec, uint64_t off) const { - if (config->armThumbPLTs) { - addSyntheticLocal("$t", STT_NOTYPE, off, 0, isec); - } else { - addSyntheticLocal("$a", STT_NOTYPE, off, 0, isec); - addSyntheticLocal("$d", STT_NOTYPE, off + 12, 0, isec); - } + addSyntheticLocal("$a", STT_NOTYPE, off, 0, isec); + addSyntheticLocal("$d", STT_NOTYPE, off + 12, 0, isec); } bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file, @@ -393,8 +325,6 @@ bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file, case R_ARM_JUMP24: // Source is ARM, all PLT entries are ARM so no interworking required. // Otherwise we need to interwork if STT_FUNC Symbol has bit 0 set (Thumb). - assert(!config->armThumbPLTs && - "If the source is ARM, we should not need Thumb PLTs"); if (s.isFunc() && expr == R_PC && (s.getVA() & 1)) return true; [[fallthrough]]; @@ -405,9 +335,9 @@ bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file, } case R_ARM_THM_JUMP19: case R_ARM_THM_JUMP24: - // Source is Thumb, when all PLT entries are ARM interworking is required. + // Source is Thumb, all PLT entries are ARM so interworking is required. // Otherwise we need to interwork if STT_FUNC Symbol has bit 0 clear (ARM). - if ((expr == R_PLT_PC && !config->armThumbPLTs) || (s.isFunc() && (s.getVA() & 1) == 0)) + if (expr == R_PLT_PC || (s.isFunc() && (s.getVA() & 1) == 0)) return true; [[fallthrough]]; case R_ARM_THM_CALL: { @@ -617,6 +547,7 @@ void ARM::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const { // STT_FUNC we choose whether to write a BL or BLX depending on the // value of bit 0 of Val. With bit 0 == 1 denoting Thumb. 
If the symbol is // not of type STT_FUNC then we must preserve the original instruction. + // PLT entries are always ARM state so we know we don't need to interwork. assert(rel.sym); // R_ARM_CALL is always reached via relocate(). bool bit0Thumb = val & 1; bool isBlx = (read32(loc) & 0xfe000000) == 0xfa000000; @@ -675,13 +606,12 @@ void ARM::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const { // PLT entries are always ARM state so we know we need to interwork. assert(rel.sym); // R_ARM_THM_CALL is always reached via relocate(). bool bit0Thumb = val & 1; - bool useThumb = bit0Thumb || config->armThumbPLTs; bool isBlx = (read16(loc + 2) & 0x1000) == 0; // lld 10.0 and before always used bit0Thumb when deciding to write a BLX - // even when type not STT_FUNC. - if (!rel.sym->isFunc() && !rel.sym->isInPlt() && isBlx == useThumb) + // even when type not STT_FUNC. PLT entries generated by LLD are always ARM. + if (!rel.sym->isFunc() && !rel.sym->isInPlt() && isBlx == bit0Thumb) stateChangeWarning(loc, rel.type, *rel.sym); - if ((rel.sym->isFunc() || rel.sym->isInPlt()) ? !useThumb : isBlx) { + if (rel.sym->isFunc() || rel.sym->isInPlt() ? !bit0Thumb : isBlx) { // We are writing a BLX. Ensure BLX destination is 4-byte aligned. As // the BLX instruction may only be two byte aligned. This must be done // before overflow check. 
diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h index 883c4a2f84294c..f0dfe7f377de0e 100644 --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -217,7 +217,6 @@ struct Config { bool allowMultipleDefinition; bool fatLTOObjects; bool androidPackDynRelocs = false; - bool armThumbPLTs = false; bool armHasBlx = false; bool armHasMovtMovw = false; bool armJ1J2BranchEncoding = false; diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index d760dddcf5ec5c..1f496026d3ae20 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -194,18 +194,6 @@ static void updateSupportedARMFeatures(const ARMAttributeParser &attributes) { if (arch >= ARMBuildAttrs::CPUArch::v8_M_Base && profile == ARMBuildAttrs::MicroControllerProfile) config->armCMSESupport = true; - - // The thumb PLT entries require Thumb2 which can be used on multiple archs. - // For now, let's limit it to ones where ARM isn't available and we know have - // Thumb2. - std::optional armISA = - attributes.getAttributeValue(ARMBuildAttrs::ARM_ISA_use); - std::optional thumb = - attributes.getAttributeValue(ARMBuildAttrs::THUMB_ISA_use); - bool noArmISA = !armISA || *armISA == ARMBuildAttrs::Not_Allowed; - bool hasThumb2 = thumb && *thumb >= ARMBuildAttrs::AllowThumb32; - if (noArmISA && hasThumb2) - config->armThumbPLTs = true; } InputFile::InputFile(Kind k, MemoryBufferRef m) diff --git a/lld/test/ELF/armv8-thumb-plt-reloc.s b/lld/test/ELF/armv8-thumb-plt-reloc.s deleted file mode 100644 index 47cd5c1b741ee0..00000000000000 --- a/lld/test/ELF/armv8-thumb-plt-reloc.s +++ /dev/null @@ -1,126 +0,0 @@ -// REQUIRES: arm -// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumb --mcpu=cortex-m33 %p/Inputs/arm-plt-reloc.s -o %t1 -// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumb --mcpu=cortex-m33 %s -o %t2 -// RUN: ld.lld %t1 %t2 -o %t -// RUN: llvm-objdump --no-print-imm-hex -d %t | FileCheck %s -// RUN: ld.lld -shared %t1 %t2 -o %t.so -// RUN: llvm-objdump --no-print-imm-hex 
-d %t.so | FileCheck --check-prefix=DSO %s -// RUN: llvm-readelf -S -r %t.so | FileCheck -check-prefix=DSOREL %s - -// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumbeb --mcpu=cortex-m33 %p/Inputs/arm-plt-reloc.s -o %t1.be -// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumbeb --mcpu=cortex-m33 %s -o %t2.be -// RUN: ld.lld %t1.be %t2.be -o %t.be -// RUN: llvm-objdump --no-print-imm-hex -d %t.be | FileCheck %s -// RUN: ld.lld -shared %t1.be %t2.be -o %t.so.be -// RUN: llvm-objdump --no-print-imm-hex -d %t.so.be | FileCheck --check-prefix=DSO %s -// RUN: llvm-readelf -S -r %t.so.be | FileCheck -check-prefix=DSOREL %s - -// RUN: ld.lld --be8 %t1.be %t2.be -o %t.be -// RUN: llvm-objdump --no-print-imm-hex -d %t.be | FileCheck %s -// RUN: ld.lld --be8 -shared %t1.be %t2.be -o %t.so.be -// RUN: llvm-objdump --no-print-imm-hex -d %t.so.be | FileCheck --check-prefix=DSO %s -// RUN: llvm-readelf -S -r %t.so.be | FileCheck -check-prefix=DSOREL %s - -/// Test PLT entry generation - .text - .align 2 - .globl _start - .type _start,%function -_start: - bl func1 - bl func2 - bl func3 - b.w func1 - b.w func2 - b.w func3 - beq.w func1 - beq.w func2 - beq.w func3 - -/// Executable, expect no PLT -// CHECK: Disassembly of section .text: -// CHECK-EMPTY: -// CHECK-NEXT: : -// CHECK-NEXT: bx lr -// CHECK: : -// CHECK-NEXT: bx lr -// CHECK: : -// CHECK-NEXT: bx lr -// CHECK-NEXT: d4d4 -// CHECK: <_start>: -// CHECK-NEXT: bl {{.*}} -// CHECK-NEXT: bl {{.*}} -// CHECK-NEXT: bl {{.*}} -// CHECK-NEXT: b.w {{.*}} -// CHECK-NEXT: b.w {{.*}} -// CHECK-NEXT: b.w {{.*}} -// CHECK-NEXT: beq.w {{.*}} -// CHECK-NEXT: beq.w {{.*}} -// CHECK-NEXT: beq.w {{.*}} - -// DSO: Disassembly of section .text: -// DSO-EMPTY: -// DSO-NEXT: : -// DSO-NEXT: bx lr -// DSO: : -// DSO-NEXT: bx lr -// DSO: : -// DSO-NEXT: bx lr -// DSO-NEXT: d4d4 -// DSO: <_start>: -/// 0x10260 = PLT func1 -// DSO-NEXT: bl 0x10260 -/// 0x10270 = PLT func2 -// DSO-NEXT: bl 0x10270 -/// 0x10280 = PLT 
func3 -// DSO-NEXT: bl 0x10280 -/// 0x10260 = PLT func1 -// DSO-NEXT: b.w 0x10260 -/// 0x10270 = PLT func2 -// DSO-NEXT: b.w 0x10270 -/// 0x10280 = PLT func3 -// DSO-NEXT: b.w 0x10280 -/// 0x10260 = PLT func1 -// DSO-NEXT: beq.w 0x10260 -/// 0x10270 = PLT func2 -// DSO-NEXT: beq.w 0x10270 -/// 0x10280 = PLT func3 -// DSO-NEXT: beq.w 0x10280 -// DSO: Disassembly of section .plt: -// DSO-EMPTY: -// DSO-NEXT: 10240 <.plt>: -// DSO-NEXT: push {lr} -// DSO-NEXT: ldr.w lr, [pc, #8] -// DSO-NEXT: add lr, pc -// DSO-NEXT: ldr pc, [lr, #8]! -/// 0x20098 = .got.plt (0x302D8) - pc (0x10238 = .plt + 8) - 8 -// DSO-NEXT: .word 0x00020098 -// DSO-NEXT: .word 0xd4d4d4d4 -// DSO-NEXT: .word 0xd4d4d4d4 -// DSO-NEXT: .word 0xd4d4d4d4 -// DSO-NEXT: .word 0xd4d4d4d4 - -/// 136 + 2 << 16 + 0x1026c = 0x302f4 = got entry 1 -// DSO-NEXT: 10260: f240 0c88 movw r12, #136 -// DSO-NEXT: f2c0 0c02 movt r12, #2 -// DSO-NEXT: 44fc add r12, pc -// DSO-NEXT: f8dc f000 ldr.w pc, [r12] -// DSO-NEXT: e7fc b 0x1026a -/// 124 + 2 << 16 + 0x1027c = 0x302f8 = got entry 2 -// DSO-NEXT: 10270: f240 0c7c movw r12, #124 -// DSO-NEXT: f2c0 0c02 movt r12, #2 -// DSO-NEXT: 44fc add r12, pc -// DSO-NEXT: f8dc f000 ldr.w pc, [r12] -// DSO-NEXT: e7fc b 0x1027a -/// 112 + 2 << 16 + 0x1028c = 0x302fc = got entry 3 -// DSO-NEXT: 10280: f240 0c70 movw r12, #112 -// DSO-NEXT: f2c0 0c02 movt r12, #2 -// DSO-NEXT: 44fc add r12, pc -// DSO-NEXT: f8dc f000 ldr.w pc, [r12] -// DSO-NEXT: e7fc b 0x1028a - -// DSOREL: .got.plt PROGBITS 000302e8 {{.*}} 000018 00 WA 0 0 4 -// DSOREL: Relocation section '.rel.plt' -// DSOREL: 000302f4 {{.*}} R_ARM_JUMP_SLOT {{.*}} func1 -// DSOREL: 000302f8 {{.*}} R_ARM_JUMP_SLOT {{.*}} func2 -// DSOREL: 000302fc {{.*}} R_ARM_JUMP_SLOT {{.*}} func3 From c250aeb9d6c590d9fdbebd84fc259c4e536dace9 Mon Sep 17 00:00:00 2001 From: Carl Ritson Date: Mon, 27 May 2024 19:33:53 +0900 Subject: [PATCH 055/230] [AMDGPU] Fix typo in VIMAGE no sampler opcode usage (NFCI) Opcodes are the same for these 
instructions in GFX11 and 12, hence this typo has no functional impact. --- llvm/lib/Target/AMDGPU/MIMGInstructions.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 351263d079768b..24f9a6e375baaf 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -494,7 +494,7 @@ class MIMG_NoSampler_nsa_gfx11 - : VIMAGE_gfx12 { + : VIMAGE_gfx12 { let InOperandList = !con(AddrIns, (ins SReg_256:$rsrc, DMask:$dmask, Dim:$dim, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe), From cbf6e93ceee7b9de2b7c3e7e8cea3a972eda0e75 Mon Sep 17 00:00:00 2001 From: Eli Friedman Date: Tue, 28 May 2024 20:47:49 -0700 Subject: [PATCH 056/230] [clang codegen] Delete unnecessary GEP cleanup code. (#90303) There's some code in AggExprEmitter::VisitCXXParenListOrInitListExpr to try to do early cleanup for GEPs for fields that aren't accessed. But it's unlikely to actually save significant compile-time, and it's subtly wrong in cases where EmitLValueForFieldInitialization() doesn't create a GEP. So just delete the code. Fixes #88077. Fixes #89547. --- clang/lib/CodeGen/CGExprAgg.cpp | 10 --------- clang/test/CodeGenCXX/no-unique-address.cpp | 25 +++++++++++++++++++++ 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp index bba00257fd4f0a..7a92fc3dfb4a43 100644 --- a/clang/lib/CodeGen/CGExprAgg.cpp +++ b/clang/lib/CodeGen/CGExprAgg.cpp @@ -1789,7 +1789,6 @@ void AggExprEmitter::VisitCXXParenListOrInitListExpr( // Push a destructor if necessary. // FIXME: if we have an array of structures, all explicitly // initialized, we can end up pushing a linear number of cleanups. 
- bool pushedCleanup = false; if (QualType::DestructionKind dtorKind = field->getType().isDestructedType()) { assert(LV.isSimple()); @@ -1797,17 +1796,8 @@ void AggExprEmitter::VisitCXXParenListOrInitListExpr( CGF.pushDestroyAndDeferDeactivation(NormalAndEHCleanup, LV.getAddress(), field->getType(), CGF.getDestroyer(dtorKind), false); - pushedCleanup = true; } } - - // If the GEP didn't get used because of a dead zero init or something - // else, clean it up for -O0 builds and general tidiness. - if (!pushedCleanup && LV.isSimple()) - if (llvm::GetElementPtrInst *GEP = - dyn_cast(LV.emitRawPointer(CGF))) - if (GEP->use_empty()) - GEP->eraseFromParent(); } } diff --git a/clang/test/CodeGenCXX/no-unique-address.cpp b/clang/test/CodeGenCXX/no-unique-address.cpp index 7b4bbbf2a05d51..82532c5e1be82a 100644 --- a/clang/test/CodeGenCXX/no-unique-address.cpp +++ b/clang/test/CodeGenCXX/no-unique-address.cpp @@ -101,3 +101,28 @@ struct HasZeroSizedFieldWithNonTrivialInit { HasZeroSizedFieldWithNonTrivialInit testHasZeroSizedFieldWithNonTrivialInit = {.a = 1}; // CHECK-LABEL: define {{.*}}cxx_global_var_init // CHECK: call {{.*}}@_ZN14NonTrivialInitC1Ev({{.*}}@testHasZeroSizedFieldWithNonTrivialInit + +void *operator new(unsigned long, void *); +template +struct _box { + [[no_unique_address]] Ty _value; +}; +// Make sure this doesn't crash. 
+// CHECK-LABEL: define {{.*}}placement_new_struct +void placement_new_struct() { + struct set_value_t {}; + + // GH88077 + struct _tuple : _box, _box {}; + + int _storage[1]; + new (_storage) _tuple{}; + + // GH89547 + struct _tuple2 { + _box a; + }; + + int _storage2[1]; + new (_storage2) _tuple2{}; +} From bb42511f64fd44f2ff1beb0dd38a653a8f2c20df Mon Sep 17 00:00:00 2001 From: Younan Zhang Date: Wed, 29 May 2024 12:58:44 +0800 Subject: [PATCH 057/230] [Clang][Sema] Use StructuralValues to model dependent NTTP arguments (#93556) This patch takes Richard's approach of no longer modeling dependent NTTP arguments with TemplateParamObjectDecls. Clang used to do so, which left behind a problem in that we might mess up dependent and non-dependent arguments that boil down to the same canonical type because there's a default argument on the NTTP. The problem of "canonical expression" is still present because this patch doesn't touch the profiling part. Namely, #92292 seems different. Fixes https://github.com/llvm/llvm-project/issues/84052 --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/AST/TemplateBase.cpp | 7 +++++- .../SemaTemplate/temp_arg_nontype_cxx2c.cpp | 23 ++++++++++++++++++- 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 9091f6341bd9b8..bd92818f0c09d0 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -810,6 +810,7 @@ Bug Fixes to C++ Support - Clang now diagnoses unexpanded parameter packs in attributes. (Fixes #GH93269). - Clang now allows ``@$``` in raw string literals. Fixes (#GH93130). - Fix an assertion failure when checking invalid ``this`` usage in the wrong context. (Fixes #GH91536). +- Clang no longer models dependent NTTP arguments as ``TemplateParamObjectDecl`` s. Fixes (#GH84052). 
Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/AST/TemplateBase.cpp b/clang/lib/AST/TemplateBase.cpp index a7ee973b7f7d06..b50daf5fbed6a7 100644 --- a/clang/lib/AST/TemplateBase.cpp +++ b/clang/lib/AST/TemplateBase.cpp @@ -221,8 +221,13 @@ static const ValueDecl *getAsSimpleValueDeclRef(const ASTContext &Ctx, // We model class non-type template parameters as their template parameter // object declaration. - if (V.isStruct() || V.isUnion()) + if (V.isStruct() || V.isUnion()) { + // Dependent types are not supposed to be described as + // TemplateParamObjectDecls. + if (T->isDependentType() || T->isInstantiationDependentType()) + return nullptr; return Ctx.getTemplateParamObjectDecl(T, V); + } // Pointers and references with an empty path use the special 'Declaration' // representation. diff --git a/clang/test/SemaTemplate/temp_arg_nontype_cxx2c.cpp b/clang/test/SemaTemplate/temp_arg_nontype_cxx2c.cpp index 9fb6b440b6b2af..e74c031eba4c1c 100644 --- a/clang/test/SemaTemplate/temp_arg_nontype_cxx2c.cpp +++ b/clang/test/SemaTemplate/temp_arg_nontype_cxx2c.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsyntax-only -std=c++20 -Wconversion -verify %s +// RUN: %clang_cc1 -fsyntax-only -std=c++2c -Wconversion -verify %s struct Test { int a = 0; @@ -102,3 +102,24 @@ void bar() { } } + +namespace GH84052 { + +template +concept C = sizeof(T...[1]) == 1; // #C + +struct A {}; + +template auto = A{}> struct Set {}; // #Set + +template void foo() { + Set unrelated; +} + +Set sb; +Set sf; +// expected-error@-1 {{constraints not satisfied for class template 'Set'}} +// expected-note@#Set {{because 'C' evaluated to false}} +// expected-note@#C {{evaluated to false}} + +} // namespace GH84052 From 465bc5e729fd755880b9a288de42a37ad1206301 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 29 May 2024 07:05:55 +0200 Subject: [PATCH 058/230] AArch64/ARM/PPC/X86: Add some atomic tests (#92933) FP typed atomic load/store coverage was mostly missing, especially 
for half and bfloat. --- .../Atomics/aarch64-atomic-load-lse2.ll | 113 ++++ .../CodeGen/AArch64/relaxed-fp-atomics.ll | 90 +++ llvm/test/CodeGen/ARM/atomic-load-store.ll | 536 ++++++++++++++++++ llvm/test/CodeGen/PowerPC/atomics.ll | 209 +++++++ llvm/test/CodeGen/X86/atomic-non-integer.ll | 97 ++++ 5 files changed, 1045 insertions(+) diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-lse2.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-lse2.ll index e7e231bc344d92..3732d4feb0c67b 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-lse2.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-lse2.ll @@ -566,6 +566,119 @@ define dso_local i128 @load_atomic_i128_unaligned_seq_cst_const(ptr readonly %pt %r = load atomic i128, ptr %ptr seq_cst, align 1 ret i128 %r } + +define dso_local half @load_atomic_f16_aligned_unordered(ptr %ptr) { +; CHECK-LABEL: load_atomic_f16_aligned_unordered: +; CHECK: ldrh w8, [x0] + %r = load atomic half, ptr %ptr unordered, align 2 + ret half %r +} + +define dso_local half @load_atomic_f16_aligned_unordered_const(ptr readonly %ptr) { +; CHECK-LABEL: load_atomic_f16_aligned_unordered_const: +; CHECK: ldrh w8, [x0] + %r = load atomic half, ptr %ptr unordered, align 2 + ret half %r +} + +define dso_local half @load_atomic_f16_aligned_monotonic(ptr %ptr) { +; CHECK-LABEL: load_atomic_f16_aligned_monotonic: +; CHECK: ldrh w8, [x0] + %r = load atomic half, ptr %ptr monotonic, align 2 + ret half %r +} + +define dso_local half @load_atomic_f16_aligned_monotonic_const(ptr readonly %ptr) { +; CHECK-LABEL: load_atomic_f16_aligned_monotonic_const: +; CHECK: ldrh w8, [x0] + %r = load atomic half, ptr %ptr monotonic, align 2 + ret half %r +} + +define dso_local half @load_atomic_f16_aligned_acquire(ptr %ptr) { +; CHECK-LABEL: load_atomic_f16_aligned_acquire: +; CHECK: ldarh w8, [x0] + %r = load atomic half, ptr %ptr acquire, align 2 + ret half %r +} + +define dso_local half 
@load_atomic_f16_aligned_acquire_const(ptr readonly %ptr) { +; CHECK-LABEL: load_atomic_f16_aligned_acquire_const: +; CHECK: ldarh w8, [x0] + %r = load atomic half, ptr %ptr acquire, align 2 + ret half %r +} + +define dso_local half @load_atomic_f16_aligned_seq_cst(ptr %ptr) { +; CHECK-LABEL: load_atomic_f16_aligned_seq_cst: +; CHECK: ldarh w8, [x0] + %r = load atomic half, ptr %ptr seq_cst, align 2 + ret half %r +} + +define dso_local half @load_atomic_f16_aligned_seq_cst_const(ptr readonly %ptr) { +; CHECK-LABEL: load_atomic_f16_aligned_seq_cst_const: +; CHECK: ldarh w8, [x0] + %r = load atomic half, ptr %ptr seq_cst, align 2 + ret half %r +} + +define dso_local bfloat @load_atomic_bf16_aligned_unordered(ptr %ptr) { +; CHECK-LABEL: load_atomic_bf16_aligned_unordered: +; CHECK: ldrh w8, [x0] + %r = load atomic bfloat, ptr %ptr unordered, align 2 + ret bfloat %r +} + +define dso_local bfloat @load_atomic_bf16_aligned_unordered_const(ptr readonly %ptr) { +; CHECK-LABEL: load_atomic_bf16_aligned_unordered_const: +; CHECK: ldrh w8, [x0] + %r = load atomic bfloat, ptr %ptr unordered, align 2 + ret bfloat %r +} + +define dso_local bfloat @load_atomic_bf16_aligned_monotonic(ptr %ptr) { +; CHECK-LABEL: load_atomic_bf16_aligned_monotonic: +; CHECK: ldrh w8, [x0] + %r = load atomic bfloat, ptr %ptr monotonic, align 2 + ret bfloat %r +} + +define dso_local bfloat @load_atomic_bf16_aligned_monotonic_const(ptr readonly %ptr) { +; CHECK-LABEL: load_atomic_bf16_aligned_monotonic_const: +; CHECK: ldrh w8, [x0] + %r = load atomic bfloat, ptr %ptr monotonic, align 2 + ret bfloat %r +} + +define dso_local bfloat @load_atomic_bf16_aligned_acquire(ptr %ptr) { +; CHECK-LABEL: load_atomic_bf16_aligned_acquire: +; CHECK: ldarh w8, [x0] + %r = load atomic bfloat, ptr %ptr acquire, align 2 + ret bfloat %r +} + +define dso_local bfloat @load_atomic_bf16_aligned_acquire_const(ptr readonly %ptr) { +; CHECK-LABEL: load_atomic_bf16_aligned_acquire_const: +; CHECK: ldarh w8, [x0] + %r = load 
atomic bfloat, ptr %ptr acquire, align 2 + ret bfloat %r +} + +define dso_local bfloat @load_atomic_bf16_aligned_seq_cst(ptr %ptr) { +; CHECK-LABEL: load_atomic_bf16_aligned_seq_cst: +; CHECK: ldarh w8, [x0] + %r = load atomic bfloat, ptr %ptr seq_cst, align 2 + ret bfloat %r +} + +define dso_local bfloat @load_atomic_bf16_aligned_seq_cst_const(ptr readonly %ptr) { +; CHECK-LABEL: load_atomic_bf16_aligned_seq_cst_const: +; CHECK: ldarh w8, [x0] + %r = load atomic bfloat, ptr %ptr seq_cst, align 2 + ret bfloat %r +} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; -O0: {{.*}} ; -O1: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/relaxed-fp-atomics.ll b/llvm/test/CodeGen/AArch64/relaxed-fp-atomics.ll index 95abbb6979be89..af664549a472a9 100644 --- a/llvm/test/CodeGen/AArch64/relaxed-fp-atomics.ll +++ b/llvm/test/CodeGen/AArch64/relaxed-fp-atomics.ll @@ -91,4 +91,94 @@ define void @atomic_store_relaxed_f64(ptr %p, i32 %off32, i64 %off64, double %va ret void } +define half @atomic_load_relaxed_f16(ptr %p, i32 %off32, i64 %off64) #0 { +; CHECK-LABEL: atomic_load_relaxed_f16: + %ptr_unsigned = getelementptr half, ptr %p, i32 4095 + %val_unsigned = load atomic half, ptr %ptr_unsigned monotonic, align 4 +; CHECK: ldrh {{w[0-9]+}}, [x0, #8190] + + %ptr_regoff = getelementptr half, ptr %p, i32 %off32 + %val_regoff = load atomic half, ptr %ptr_regoff unordered, align 4 + %tot1 = fadd half %val_unsigned, %val_regoff +; CHECK: ldrh {{w[0-9]+}}, [x0, w1, sxtw #1] + + %ptr_regoff64 = getelementptr half, ptr %p, i64 %off64 + %val_regoff64 = load atomic half, ptr %ptr_regoff64 monotonic, align 4 + %tot2 = fadd half %tot1, %val_regoff64 +; CHECK: ldrh {{w[0-9]+}}, [x0, x2, lsl #1] + + %ptr_unscaled = getelementptr half, ptr %p, i32 -64 + %val_unscaled = load atomic half, ptr %ptr_unscaled unordered, align 4 + %tot3 = fadd half %tot2, %val_unscaled +; CHECK: ldurh {{w[0-9]+}}, [x0, #-128] + + ret half %tot3 +} + +define bfloat 
@atomic_load_relaxed_bf16(ptr %p, i32 %off32, i64 %off64) #0 { +; CHECK-LABEL: atomic_load_relaxed_bf16: + %ptr_unsigned = getelementptr bfloat, ptr %p, i32 4095 + %val_unsigned = load atomic bfloat, ptr %ptr_unsigned monotonic, align 4 +; CHECK: ldrh {{w[0-9]+}}, [x0, #8190] + + %ptr_regoff = getelementptr bfloat, ptr %p, i32 %off32 + %val_regoff = load atomic bfloat, ptr %ptr_regoff unordered, align 4 + %tot1 = fadd bfloat %val_unsigned, %val_regoff +; CHECK: ldrh {{w[0-9]+}}, [x0, w1, sxtw #1] + + %ptr_regoff64 = getelementptr bfloat, ptr %p, i64 %off64 + %val_regoff64 = load atomic bfloat, ptr %ptr_regoff64 monotonic, align 4 + %tot2 = fadd bfloat %tot1, %val_regoff64 +; CHECK: ldrh {{w[0-9]+}}, [x0, x2, lsl #1] + + %ptr_unscaled = getelementptr bfloat, ptr %p, i32 -64 + %val_unscaled = load atomic bfloat, ptr %ptr_unscaled unordered, align 4 + %tot3 = fadd bfloat %tot2, %val_unscaled +; CHECK: ldurh {{w[0-9]+}}, [x0, #-128] + + ret bfloat %tot3 +} + +define void @atomic_store_relaxed_f16(ptr %p, i32 %off32, i64 %off64, half %val) #0 { +; CHECK-LABEL: atomic_store_relaxed_f16: + %ptr_unsigned = getelementptr half, ptr %p, i32 4095 + store atomic half %val, ptr %ptr_unsigned monotonic, align 4 +; CHECK: strh {{w[0-9]+}}, [x0, #8190] + + %ptr_regoff = getelementptr half, ptr %p, i32 %off32 + store atomic half %val, ptr %ptr_regoff unordered, align 4 +; CHECK: strh {{w[0-9]+}}, [x0, w1, sxtw #1] + + %ptr_regoff64 = getelementptr half, ptr %p, i64 %off64 + store atomic half %val, ptr %ptr_regoff64 monotonic, align 4 +; CHECK: strh {{w[0-9]+}}, [x0, x2, lsl #1] + + %ptr_unscaled = getelementptr half, ptr %p, i32 -64 + store atomic half %val, ptr %ptr_unscaled unordered, align 4 +; CHECK: sturh {{w[0-9]+}}, [x0, #-128] + + ret void +} + +define void @atomic_store_relaxed_bf16(ptr %p, i32 %off32, i64 %off64, bfloat %val) #0 { +; CHECK-LABEL: atomic_store_relaxed_bf16: + %ptr_unsigned = getelementptr bfloat, ptr %p, i32 4095 + store atomic bfloat %val, ptr 
%ptr_unsigned monotonic, align 4 +; CHECK: strh {{w[0-9]+}}, [x0, #8190] + + %ptr_regoff = getelementptr bfloat, ptr %p, i32 %off32 + store atomic bfloat %val, ptr %ptr_regoff unordered, align 4 +; CHECK: strh {{w[0-9]+}}, [x0, w1, sxtw #1] + + %ptr_regoff64 = getelementptr bfloat, ptr %p, i64 %off64 + store atomic bfloat %val, ptr %ptr_regoff64 monotonic, align 4 +; CHECK: strh {{w[0-9]+}}, [x0, x2, lsl #1] + + %ptr_unscaled = getelementptr bfloat, ptr %p, i32 -64 + store atomic bfloat %val, ptr %ptr_unscaled unordered, align 4 +; CHECK: sturh {{w[0-9]+}}, [x0, #-128] + + ret void +} + attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/ARM/atomic-load-store.ll b/llvm/test/CodeGen/ARM/atomic-load-store.ll index 4f2e63b5f24676..c53fb2f330a792 100644 --- a/llvm/test/CodeGen/ARM/atomic-load-store.ll +++ b/llvm/test/CodeGen/ARM/atomic-load-store.ll @@ -439,3 +439,539 @@ define void @test_old_store_64bit(ptr %p, i64 %v) { store atomic i64 %v, ptr %p seq_cst, align 8 ret void } + +define half @load_atomic_f16__seq_cst(ptr %ptr) { +; ARM-LABEL: load_atomic_f16__seq_cst: +; ARM: @ %bb.0: +; ARM-NEXT: ldrh r0, [r0] +; ARM-NEXT: dmb ish +; ARM-NEXT: bx lr +; +; ARMOPTNONE-LABEL: load_atomic_f16__seq_cst: +; ARMOPTNONE: @ %bb.0: +; ARMOPTNONE-NEXT: ldrh r0, [r0] +; ARMOPTNONE-NEXT: dmb ish +; ARMOPTNONE-NEXT: bx lr +; +; THUMBTWO-LABEL: load_atomic_f16__seq_cst: +; THUMBTWO: @ %bb.0: +; THUMBTWO-NEXT: ldrh r0, [r0] +; THUMBTWO-NEXT: dmb ish +; THUMBTWO-NEXT: bx lr +; +; THUMBONE-LABEL: load_atomic_f16__seq_cst: +; THUMBONE: @ %bb.0: +; THUMBONE-NEXT: push {r7, lr} +; THUMBONE-NEXT: movs r1, #0 +; THUMBONE-NEXT: mov r2, r1 +; THUMBONE-NEXT: bl __sync_val_compare_and_swap_2 +; THUMBONE-NEXT: pop {r7, pc} +; +; ARMV4-LABEL: load_atomic_f16__seq_cst: +; ARMV4: @ %bb.0: +; ARMV4-NEXT: push {r11, lr} +; ARMV4-NEXT: mov r1, #5 +; ARMV4-NEXT: bl __atomic_load_2 +; ARMV4-NEXT: pop {r11, lr} +; ARMV4-NEXT: mov pc, lr +; +; ARMV6-LABEL: load_atomic_f16__seq_cst: +; ARMV6: @ 
%bb.0: +; ARMV6-NEXT: ldrh r0, [r0] +; ARMV6-NEXT: mov r1, #0 +; ARMV6-NEXT: mcr p15, #0, r1, c7, c10, #5 +; ARMV6-NEXT: bx lr +; +; THUMBM-LABEL: load_atomic_f16__seq_cst: +; THUMBM: @ %bb.0: +; THUMBM-NEXT: ldrh r0, [r0] +; THUMBM-NEXT: dmb sy +; THUMBM-NEXT: bx lr + %val = load atomic half, ptr %ptr seq_cst, align 2 + ret half %val +} + +define bfloat @load_atomic_bf16__seq_cst(ptr %ptr) { +; ARM-LABEL: load_atomic_bf16__seq_cst: +; ARM: @ %bb.0: +; ARM-NEXT: ldrh r0, [r0] +; ARM-NEXT: dmb ish +; ARM-NEXT: bx lr +; +; ARMOPTNONE-LABEL: load_atomic_bf16__seq_cst: +; ARMOPTNONE: @ %bb.0: +; ARMOPTNONE-NEXT: ldrh r0, [r0] +; ARMOPTNONE-NEXT: dmb ish +; ARMOPTNONE-NEXT: bx lr +; +; THUMBTWO-LABEL: load_atomic_bf16__seq_cst: +; THUMBTWO: @ %bb.0: +; THUMBTWO-NEXT: ldrh r0, [r0] +; THUMBTWO-NEXT: dmb ish +; THUMBTWO-NEXT: bx lr +; +; THUMBONE-LABEL: load_atomic_bf16__seq_cst: +; THUMBONE: @ %bb.0: +; THUMBONE-NEXT: push {r7, lr} +; THUMBONE-NEXT: movs r1, #0 +; THUMBONE-NEXT: mov r2, r1 +; THUMBONE-NEXT: bl __sync_val_compare_and_swap_2 +; THUMBONE-NEXT: pop {r7, pc} +; +; ARMV4-LABEL: load_atomic_bf16__seq_cst: +; ARMV4: @ %bb.0: +; ARMV4-NEXT: push {r11, lr} +; ARMV4-NEXT: mov r1, #5 +; ARMV4-NEXT: bl __atomic_load_2 +; ARMV4-NEXT: pop {r11, lr} +; ARMV4-NEXT: mov pc, lr +; +; ARMV6-LABEL: load_atomic_bf16__seq_cst: +; ARMV6: @ %bb.0: +; ARMV6-NEXT: ldrh r0, [r0] +; ARMV6-NEXT: mov r1, #0 +; ARMV6-NEXT: mcr p15, #0, r1, c7, c10, #5 +; ARMV6-NEXT: bx lr +; +; THUMBM-LABEL: load_atomic_bf16__seq_cst: +; THUMBM: @ %bb.0: +; THUMBM-NEXT: ldrh r0, [r0] +; THUMBM-NEXT: dmb sy +; THUMBM-NEXT: bx lr + %val = load atomic bfloat, ptr %ptr seq_cst, align 2 + ret bfloat %val +} + +define float @load_atomic_f32__seq_cst(ptr %ptr) { +; ARM-LABEL: load_atomic_f32__seq_cst: +; ARM: @ %bb.0: +; ARM-NEXT: ldr r0, [r0] +; ARM-NEXT: dmb ish +; ARM-NEXT: bx lr +; +; ARMOPTNONE-LABEL: load_atomic_f32__seq_cst: +; ARMOPTNONE: @ %bb.0: +; ARMOPTNONE-NEXT: ldr r0, [r0] +; ARMOPTNONE-NEXT: 
dmb ish +; ARMOPTNONE-NEXT: vmov s0, r0 +; ARMOPTNONE-NEXT: bx lr +; +; THUMBTWO-LABEL: load_atomic_f32__seq_cst: +; THUMBTWO: @ %bb.0: +; THUMBTWO-NEXT: ldr r0, [r0] +; THUMBTWO-NEXT: dmb ish +; THUMBTWO-NEXT: bx lr +; +; THUMBONE-LABEL: load_atomic_f32__seq_cst: +; THUMBONE: @ %bb.0: +; THUMBONE-NEXT: push {r7, lr} +; THUMBONE-NEXT: movs r1, #0 +; THUMBONE-NEXT: mov r2, r1 +; THUMBONE-NEXT: bl __sync_val_compare_and_swap_4 +; THUMBONE-NEXT: pop {r7, pc} +; +; ARMV4-LABEL: load_atomic_f32__seq_cst: +; ARMV4: @ %bb.0: +; ARMV4-NEXT: push {r11, lr} +; ARMV4-NEXT: mov r1, #5 +; ARMV4-NEXT: bl __atomic_load_4 +; ARMV4-NEXT: pop {r11, lr} +; ARMV4-NEXT: mov pc, lr +; +; ARMV6-LABEL: load_atomic_f32__seq_cst: +; ARMV6: @ %bb.0: +; ARMV6-NEXT: ldr r0, [r0] +; ARMV6-NEXT: mov r1, #0 +; ARMV6-NEXT: mcr p15, #0, r1, c7, c10, #5 +; ARMV6-NEXT: bx lr +; +; THUMBM-LABEL: load_atomic_f32__seq_cst: +; THUMBM: @ %bb.0: +; THUMBM-NEXT: ldr r0, [r0] +; THUMBM-NEXT: dmb sy +; THUMBM-NEXT: bx lr + %val = load atomic float, ptr %ptr seq_cst, align 4 + ret float %val +} + +define double @load_atomic_f64__seq_cst(ptr %ptr) { +; ARM-LABEL: load_atomic_f64__seq_cst: +; ARM: @ %bb.0: +; ARM-NEXT: ldrexd r0, r1, [r0] +; ARM-NEXT: clrex +; ARM-NEXT: dmb ish +; ARM-NEXT: bx lr +; +; ARMOPTNONE-LABEL: load_atomic_f64__seq_cst: +; ARMOPTNONE: @ %bb.0: +; ARMOPTNONE-NEXT: ldrexd r2, r3, [r0] +; ARMOPTNONE-NEXT: mov r1, r3 +; ARMOPTNONE-NEXT: mov r0, r2 +; ARMOPTNONE-NEXT: clrex +; ARMOPTNONE-NEXT: dmb ish +; ARMOPTNONE-NEXT: vmov d16, r0, r1 +; ARMOPTNONE-NEXT: bx lr +; +; THUMBTWO-LABEL: load_atomic_f64__seq_cst: +; THUMBTWO: @ %bb.0: +; THUMBTWO-NEXT: ldrexd r0, r1, [r0] +; THUMBTWO-NEXT: clrex +; THUMBTWO-NEXT: dmb ish +; THUMBTWO-NEXT: bx lr +; +; THUMBONE-LABEL: load_atomic_f64__seq_cst: +; THUMBONE: @ %bb.0: +; THUMBONE-NEXT: push {r7, lr} +; THUMBONE-NEXT: sub sp, #8 +; THUMBONE-NEXT: movs r2, #0 +; THUMBONE-NEXT: str r2, [sp] +; THUMBONE-NEXT: str r2, [sp, #4] +; THUMBONE-NEXT: mov r3, 
r2 +; THUMBONE-NEXT: bl __sync_val_compare_and_swap_8 +; THUMBONE-NEXT: add sp, #8 +; THUMBONE-NEXT: pop {r7, pc} +; +; ARMV4-LABEL: load_atomic_f64__seq_cst: +; ARMV4: @ %bb.0: +; ARMV4-NEXT: push {r11, lr} +; ARMV4-NEXT: mov r1, #5 +; ARMV4-NEXT: bl __atomic_load_8 +; ARMV4-NEXT: pop {r11, lr} +; ARMV4-NEXT: mov pc, lr +; +; ARMV6-LABEL: load_atomic_f64__seq_cst: +; ARMV6: @ %bb.0: +; ARMV6-NEXT: ldrexd r0, r1, [r0] +; ARMV6-NEXT: mov r2, #0 +; ARMV6-NEXT: mcr p15, #0, r2, c7, c10, #5 +; ARMV6-NEXT: bx lr +; +; THUMBM-LABEL: load_atomic_f64__seq_cst: +; THUMBM: @ %bb.0: +; THUMBM-NEXT: push {r7, lr} +; THUMBM-NEXT: movs r1, #5 +; THUMBM-NEXT: bl __atomic_load_8 +; THUMBM-NEXT: pop {r7, pc} + %val = load atomic double, ptr %ptr seq_cst, align 8 + ret double %val +} + +define void @store_atomic_f16__seq_cst(ptr %ptr, half %val1) { +; ARM-LABEL: store_atomic_f16__seq_cst: +; ARM: @ %bb.0: +; ARM-NEXT: dmb ish +; ARM-NEXT: strh r1, [r0] +; ARM-NEXT: dmb ish +; ARM-NEXT: bx lr +; +; ARMOPTNONE-LABEL: store_atomic_f16__seq_cst: +; ARMOPTNONE: @ %bb.0: +; ARMOPTNONE-NEXT: sub sp, sp, #4 +; ARMOPTNONE-NEXT: str r1, [sp] @ 4-byte Spill +; ARMOPTNONE-NEXT: mov r1, r0 +; ARMOPTNONE-NEXT: ldr r0, [sp] @ 4-byte Reload +; ARMOPTNONE-NEXT: vmov s0, r0 +; ARMOPTNONE-NEXT: vmov r0, s0 +; ARMOPTNONE-NEXT: dmb ish +; ARMOPTNONE-NEXT: strh r0, [r1] +; ARMOPTNONE-NEXT: dmb ish +; ARMOPTNONE-NEXT: add sp, sp, #4 +; ARMOPTNONE-NEXT: bx lr +; +; THUMBTWO-LABEL: store_atomic_f16__seq_cst: +; THUMBTWO: @ %bb.0: +; THUMBTWO-NEXT: dmb ish +; THUMBTWO-NEXT: strh r1, [r0] +; THUMBTWO-NEXT: dmb ish +; THUMBTWO-NEXT: bx lr +; +; THUMBONE-LABEL: store_atomic_f16__seq_cst: +; THUMBONE: @ %bb.0: +; THUMBONE-NEXT: push {r7, lr} +; THUMBONE-NEXT: bl __sync_lock_test_and_set_2 +; THUMBONE-NEXT: pop {r7, pc} +; +; ARMV4-LABEL: store_atomic_f16__seq_cst: +; ARMV4: @ %bb.0: +; ARMV4-NEXT: push {r11, lr} +; ARMV4-NEXT: mov r2, #5 +; ARMV4-NEXT: bl __atomic_store_2 +; ARMV4-NEXT: pop {r11, lr} +; 
ARMV4-NEXT: mov pc, lr +; +; ARMV6-LABEL: store_atomic_f16__seq_cst: +; ARMV6: @ %bb.0: +; ARMV6-NEXT: mov r2, #0 +; ARMV6-NEXT: mcr p15, #0, r2, c7, c10, #5 +; ARMV6-NEXT: strh r1, [r0] +; ARMV6-NEXT: mcr p15, #0, r2, c7, c10, #5 +; ARMV6-NEXT: bx lr +; +; THUMBM-LABEL: store_atomic_f16__seq_cst: +; THUMBM: @ %bb.0: +; THUMBM-NEXT: dmb sy +; THUMBM-NEXT: strh r1, [r0] +; THUMBM-NEXT: dmb sy +; THUMBM-NEXT: bx lr + store atomic half %val1, ptr %ptr seq_cst, align 2 + ret void +} + +define void @store_atomic_bf16__seq_cst(ptr %ptr, bfloat %val1) { +; ARM-LABEL: store_atomic_bf16__seq_cst: +; ARM: @ %bb.0: +; ARM-NEXT: dmb ish +; ARM-NEXT: strh r1, [r0] +; ARM-NEXT: dmb ish +; ARM-NEXT: bx lr +; +; ARMOPTNONE-LABEL: store_atomic_bf16__seq_cst: +; ARMOPTNONE: @ %bb.0: +; ARMOPTNONE-NEXT: sub sp, sp, #4 +; ARMOPTNONE-NEXT: str r1, [sp] @ 4-byte Spill +; ARMOPTNONE-NEXT: mov r1, r0 +; ARMOPTNONE-NEXT: ldr r0, [sp] @ 4-byte Reload +; ARMOPTNONE-NEXT: vmov s0, r0 +; ARMOPTNONE-NEXT: vmov r0, s0 +; ARMOPTNONE-NEXT: dmb ish +; ARMOPTNONE-NEXT: strh r0, [r1] +; ARMOPTNONE-NEXT: dmb ish +; ARMOPTNONE-NEXT: add sp, sp, #4 +; ARMOPTNONE-NEXT: bx lr +; +; THUMBTWO-LABEL: store_atomic_bf16__seq_cst: +; THUMBTWO: @ %bb.0: +; THUMBTWO-NEXT: dmb ish +; THUMBTWO-NEXT: strh r1, [r0] +; THUMBTWO-NEXT: dmb ish +; THUMBTWO-NEXT: bx lr +; +; THUMBONE-LABEL: store_atomic_bf16__seq_cst: +; THUMBONE: @ %bb.0: +; THUMBONE-NEXT: push {r7, lr} +; THUMBONE-NEXT: bl __sync_lock_test_and_set_2 +; THUMBONE-NEXT: pop {r7, pc} +; +; ARMV4-LABEL: store_atomic_bf16__seq_cst: +; ARMV4: @ %bb.0: +; ARMV4-NEXT: push {r11, lr} +; ARMV4-NEXT: mov r2, #5 +; ARMV4-NEXT: bl __atomic_store_2 +; ARMV4-NEXT: pop {r11, lr} +; ARMV4-NEXT: mov pc, lr +; +; ARMV6-LABEL: store_atomic_bf16__seq_cst: +; ARMV6: @ %bb.0: +; ARMV6-NEXT: mov r2, #0 +; ARMV6-NEXT: mcr p15, #0, r2, c7, c10, #5 +; ARMV6-NEXT: strh r1, [r0] +; ARMV6-NEXT: mcr p15, #0, r2, c7, c10, #5 +; ARMV6-NEXT: bx lr +; +; THUMBM-LABEL: 
store_atomic_bf16__seq_cst: +; THUMBM: @ %bb.0: +; THUMBM-NEXT: dmb sy +; THUMBM-NEXT: strh r1, [r0] +; THUMBM-NEXT: dmb sy +; THUMBM-NEXT: bx lr + store atomic bfloat %val1, ptr %ptr seq_cst, align 2 + ret void +} + +define void @store_atomic_f32__seq_cst(ptr %ptr, float %val1) { +; ARM-LABEL: store_atomic_f32__seq_cst: +; ARM: @ %bb.0: +; ARM-NEXT: dmb ish +; ARM-NEXT: str r1, [r0] +; ARM-NEXT: dmb ish +; ARM-NEXT: bx lr +; +; ARMOPTNONE-LABEL: store_atomic_f32__seq_cst: +; ARMOPTNONE: @ %bb.0: +; ARMOPTNONE-NEXT: sub sp, sp, #4 +; ARMOPTNONE-NEXT: str r1, [sp] @ 4-byte Spill +; ARMOPTNONE-NEXT: mov r1, r0 +; ARMOPTNONE-NEXT: ldr r0, [sp] @ 4-byte Reload +; ARMOPTNONE-NEXT: vmov s0, r0 +; ARMOPTNONE-NEXT: vmov r0, s0 +; ARMOPTNONE-NEXT: dmb ish +; ARMOPTNONE-NEXT: str r0, [r1] +; ARMOPTNONE-NEXT: dmb ish +; ARMOPTNONE-NEXT: add sp, sp, #4 +; ARMOPTNONE-NEXT: bx lr +; +; THUMBTWO-LABEL: store_atomic_f32__seq_cst: +; THUMBTWO: @ %bb.0: +; THUMBTWO-NEXT: dmb ish +; THUMBTWO-NEXT: str r1, [r0] +; THUMBTWO-NEXT: dmb ish +; THUMBTWO-NEXT: bx lr +; +; THUMBONE-LABEL: store_atomic_f32__seq_cst: +; THUMBONE: @ %bb.0: +; THUMBONE-NEXT: push {r7, lr} +; THUMBONE-NEXT: bl __sync_lock_test_and_set_4 +; THUMBONE-NEXT: pop {r7, pc} +; +; ARMV4-LABEL: store_atomic_f32__seq_cst: +; ARMV4: @ %bb.0: +; ARMV4-NEXT: push {r11, lr} +; ARMV4-NEXT: mov r2, #5 +; ARMV4-NEXT: bl __atomic_store_4 +; ARMV4-NEXT: pop {r11, lr} +; ARMV4-NEXT: mov pc, lr +; +; ARMV6-LABEL: store_atomic_f32__seq_cst: +; ARMV6: @ %bb.0: +; ARMV6-NEXT: mov r2, #0 +; ARMV6-NEXT: mcr p15, #0, r2, c7, c10, #5 +; ARMV6-NEXT: str r1, [r0] +; ARMV6-NEXT: mcr p15, #0, r2, c7, c10, #5 +; ARMV6-NEXT: bx lr +; +; THUMBM-LABEL: store_atomic_f32__seq_cst: +; THUMBM: @ %bb.0: +; THUMBM-NEXT: dmb sy +; THUMBM-NEXT: str r1, [r0] +; THUMBM-NEXT: dmb sy +; THUMBM-NEXT: bx lr + store atomic float %val1, ptr %ptr seq_cst, align 4 + ret void +} + +define void @store_atomic_f64__seq_cst(ptr %ptr, double %val1) { +; ARM-LABEL: 
store_atomic_f64__seq_cst: +; ARM: @ %bb.0: +; ARM-NEXT: push {r4, r5, lr} +; ARM-NEXT: mov r3, r2 +; ARM-NEXT: dmb ish +; ARM-NEXT: mov r2, r1 +; ARM-NEXT: LBB13_1: @ %atomicrmw.start +; ARM-NEXT: @ =>This Inner Loop Header: Depth=1 +; ARM-NEXT: ldrexd r4, r5, [r0] +; ARM-NEXT: strexd r1, r2, r3, [r0] +; ARM-NEXT: cmp r1, #0 +; ARM-NEXT: bne LBB13_1 +; ARM-NEXT: @ %bb.2: @ %atomicrmw.end +; ARM-NEXT: dmb ish +; ARM-NEXT: pop {r4, r5, pc} +; +; ARMOPTNONE-LABEL: store_atomic_f64__seq_cst: +; ARMOPTNONE: @ %bb.0: +; ARMOPTNONE-NEXT: push {r4, r5, r7, lr} +; ARMOPTNONE-NEXT: add r7, sp, #8 +; ARMOPTNONE-NEXT: push {r8, r10, r11} +; ARMOPTNONE-NEXT: sub sp, sp, #20 +; ARMOPTNONE-NEXT: str r0, [sp] @ 4-byte Spill +; ARMOPTNONE-NEXT: vmov d16, r1, r2 +; ARMOPTNONE-NEXT: vmov r1, r2, d16 +; ARMOPTNONE-NEXT: str r2, [sp, #4] @ 4-byte Spill +; ARMOPTNONE-NEXT: str r1, [sp, #8] @ 4-byte Spill +; ARMOPTNONE-NEXT: dmb ish +; ARMOPTNONE-NEXT: ldr r1, [r0] +; ARMOPTNONE-NEXT: ldr r0, [r0, #4] +; ARMOPTNONE-NEXT: str r1, [sp, #12] @ 4-byte Spill +; ARMOPTNONE-NEXT: str r0, [sp, #16] @ 4-byte Spill +; ARMOPTNONE-NEXT: b LBB13_1 +; ARMOPTNONE-NEXT: LBB13_1: @ %atomicrmw.start +; ARMOPTNONE-NEXT: @ =>This Loop Header: Depth=1 +; ARMOPTNONE-NEXT: @ Child Loop BB13_2 Depth 2 +; ARMOPTNONE-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; ARMOPTNONE-NEXT: ldr r2, [sp, #12] @ 4-byte Reload +; ARMOPTNONE-NEXT: ldr r3, [sp] @ 4-byte Reload +; ARMOPTNONE-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; ARMOPTNONE-NEXT: ldr r10, [sp, #8] @ 4-byte Reload +; ARMOPTNONE-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; ARMOPTNONE-NEXT: mov r11, r0 +; ARMOPTNONE-NEXT: mov r8, r2 +; ARMOPTNONE-NEXT: mov r9, r1 +; ARMOPTNONE-NEXT: LBB13_2: @ %atomicrmw.start +; ARMOPTNONE-NEXT: @ Parent Loop BB13_1 Depth=1 +; ARMOPTNONE-NEXT: @ => This Inner Loop Header: Depth=2 +; ARMOPTNONE-NEXT: ldrexd r4, r5, [r3] +; ARMOPTNONE-NEXT: cmp r4, r8 +; ARMOPTNONE-NEXT: cmpeq r5, r9 +; ARMOPTNONE-NEXT: bne LBB13_4 +; 
ARMOPTNONE-NEXT: @ %bb.3: @ %atomicrmw.start +; ARMOPTNONE-NEXT: @ in Loop: Header=BB13_2 Depth=2 +; ARMOPTNONE-NEXT: strexd r0, r10, r11, [r3] +; ARMOPTNONE-NEXT: cmp r0, #0 +; ARMOPTNONE-NEXT: bne LBB13_2 +; ARMOPTNONE-NEXT: LBB13_4: @ %atomicrmw.start +; ARMOPTNONE-NEXT: @ in Loop: Header=BB13_1 Depth=1 +; ARMOPTNONE-NEXT: mov r0, r5 +; ARMOPTNONE-NEXT: eor r3, r0, r1 +; ARMOPTNONE-NEXT: mov r1, r4 +; ARMOPTNONE-NEXT: eor r2, r1, r2 +; ARMOPTNONE-NEXT: orr r2, r2, r3 +; ARMOPTNONE-NEXT: cmp r2, #0 +; ARMOPTNONE-NEXT: str r1, [sp, #12] @ 4-byte Spill +; ARMOPTNONE-NEXT: str r0, [sp, #16] @ 4-byte Spill +; ARMOPTNONE-NEXT: bne LBB13_1 +; ARMOPTNONE-NEXT: b LBB13_5 +; ARMOPTNONE-NEXT: LBB13_5: @ %atomicrmw.end +; ARMOPTNONE-NEXT: dmb ish +; ARMOPTNONE-NEXT: sub sp, r7, #20 +; ARMOPTNONE-NEXT: pop {r8, r10, r11} +; ARMOPTNONE-NEXT: pop {r4, r5, r7, pc} +; +; THUMBTWO-LABEL: store_atomic_f64__seq_cst: +; THUMBTWO: @ %bb.0: +; THUMBTWO-NEXT: dmb ish +; THUMBTWO-NEXT: LBB13_1: @ %atomicrmw.start +; THUMBTWO-NEXT: @ =>This Inner Loop Header: Depth=1 +; THUMBTWO-NEXT: ldrexd r3, r9, [r0] +; THUMBTWO-NEXT: strexd r3, r1, r2, [r0] +; THUMBTWO-NEXT: cmp r3, #0 +; THUMBTWO-NEXT: bne LBB13_1 +; THUMBTWO-NEXT: @ %bb.2: @ %atomicrmw.end +; THUMBTWO-NEXT: dmb ish +; THUMBTWO-NEXT: bx lr +; +; THUMBONE-LABEL: store_atomic_f64__seq_cst: +; THUMBONE: @ %bb.0: +; THUMBONE-NEXT: push {r7, lr} +; THUMBONE-NEXT: bl __sync_lock_test_and_set_8 +; THUMBONE-NEXT: pop {r7, pc} +; +; ARMV4-LABEL: store_atomic_f64__seq_cst: +; ARMV4: @ %bb.0: +; ARMV4-NEXT: push {r11, lr} +; ARMV4-NEXT: sub sp, sp, #8 +; ARMV4-NEXT: mov r1, #5 +; ARMV4-NEXT: str r1, [sp] +; ARMV4-NEXT: bl __atomic_store_8 +; ARMV4-NEXT: add sp, sp, #8 +; ARMV4-NEXT: pop {r11, lr} +; ARMV4-NEXT: mov pc, lr +; +; ARMV6-LABEL: store_atomic_f64__seq_cst: +; ARMV6: @ %bb.0: +; ARMV6-NEXT: push {r4, r5, r11, lr} +; ARMV6-NEXT: @ kill: def $r3 killed $r3 killed $r2_r3 def $r2_r3 +; ARMV6-NEXT: mov r1, #0 +; ARMV6-NEXT: @ kill: def 
$r2 killed $r2 killed $r2_r3 def $r2_r3 +; ARMV6-NEXT: mcr p15, #0, r1, c7, c10, #5 +; ARMV6-NEXT: .LBB13_1: @ %atomicrmw.start +; ARMV6-NEXT: @ =>This Inner Loop Header: Depth=1 +; ARMV6-NEXT: ldrexd r4, r5, [r0] +; ARMV6-NEXT: strexd r1, r2, r3, [r0] +; ARMV6-NEXT: cmp r1, #0 +; ARMV6-NEXT: bne .LBB13_1 +; ARMV6-NEXT: @ %bb.2: @ %atomicrmw.end +; ARMV6-NEXT: mov r0, #0 +; ARMV6-NEXT: mcr p15, #0, r0, c7, c10, #5 +; ARMV6-NEXT: pop {r4, r5, r11, pc} +; +; THUMBM-LABEL: store_atomic_f64__seq_cst: +; THUMBM: @ %bb.0: +; THUMBM-NEXT: push {r7, lr} +; THUMBM-NEXT: sub sp, #8 +; THUMBM-NEXT: movs r1, #5 +; THUMBM-NEXT: str r1, [sp] +; THUMBM-NEXT: bl __atomic_store_8 +; THUMBM-NEXT: add sp, #8 +; THUMBM-NEXT: pop {r7, pc} + store atomic double %val1, ptr %ptr seq_cst, align 8 + ret void +} diff --git a/llvm/test/CodeGen/PowerPC/atomics.ll b/llvm/test/CodeGen/PowerPC/atomics.ll index 04cdbe9d7e7859..ff5bec53acd257 100644 --- a/llvm/test/CodeGen/PowerPC/atomics.ll +++ b/llvm/test/CodeGen/PowerPC/atomics.ll @@ -462,3 +462,212 @@ define i64 @and_i64_release(ptr %mem, i64 %operand) { %val = atomicrmw and ptr %mem, i64 %operand release ret i64 %val } + +define half @load_atomic_f16__seq_cst(ptr %ptr) { +; PPC32-LABEL: load_atomic_f16__seq_cst: +; PPC32: # %bb.0: +; PPC32-NEXT: mflr r0 +; PPC32-NEXT: stwu r1, -16(r1) +; PPC32-NEXT: stw r0, 20(r1) +; PPC32-NEXT: .cfi_def_cfa_offset 16 +; PPC32-NEXT: .cfi_offset lr, 4 +; PPC32-NEXT: sync +; PPC32-NEXT: lhz r3, 0(r3) +; PPC32-NEXT: cmpw cr7, r3, r3 +; PPC32-NEXT: bne- cr7, .+4 +; PPC32-NEXT: isync +; PPC32-NEXT: bl __gnu_h2f_ieee +; PPC32-NEXT: lwz r0, 20(r1) +; PPC32-NEXT: addi r1, r1, 16 +; PPC32-NEXT: mtlr r0 +; PPC32-NEXT: blr +; +; PPC64-LABEL: load_atomic_f16__seq_cst: +; PPC64: # %bb.0: +; PPC64-NEXT: mflr r0 +; PPC64-NEXT: stdu r1, -112(r1) +; PPC64-NEXT: std r0, 128(r1) +; PPC64-NEXT: .cfi_def_cfa_offset 112 +; PPC64-NEXT: .cfi_offset lr, 16 +; PPC64-NEXT: sync +; PPC64-NEXT: lhz r3, 0(r3) +; PPC64-NEXT: cmpd cr7, r3, 
r3 +; PPC64-NEXT: bne- cr7, .+4 +; PPC64-NEXT: isync +; PPC64-NEXT: bl __gnu_h2f_ieee +; PPC64-NEXT: nop +; PPC64-NEXT: addi r1, r1, 112 +; PPC64-NEXT: ld r0, 16(r1) +; PPC64-NEXT: mtlr r0 +; PPC64-NEXT: blr + %val = load atomic half, ptr %ptr seq_cst, align 2 + ret half %val +} + +; FIXME: bf16_to_fp fails to select +; define bfloat @load_atomic_bf16__seq_cst(ptr %ptr) { +; %val = load atomic bfloat, ptr %ptr seq_cst, align 2 +; ret bfloat %val +; } + +define float @load_atomic_f32__seq_cst(ptr %ptr) { +; PPC32-LABEL: load_atomic_f32__seq_cst: +; PPC32: # %bb.0: +; PPC32-NEXT: stwu r1, -16(r1) +; PPC32-NEXT: .cfi_def_cfa_offset 16 +; PPC32-NEXT: sync +; PPC32-NEXT: lwz r3, 0(r3) +; PPC32-NEXT: cmpw cr7, r3, r3 +; PPC32-NEXT: bne- cr7, .+4 +; PPC32-NEXT: isync +; PPC32-NEXT: stw r3, 12(r1) +; PPC32-NEXT: lfs f1, 12(r1) +; PPC32-NEXT: addi r1, r1, 16 +; PPC32-NEXT: blr +; +; PPC64-LABEL: load_atomic_f32__seq_cst: +; PPC64: # %bb.0: +; PPC64-NEXT: sync +; PPC64-NEXT: lwz r3, 0(r3) +; PPC64-NEXT: cmpd cr7, r3, r3 +; PPC64-NEXT: bne- cr7, .+4 +; PPC64-NEXT: isync +; PPC64-NEXT: stw r3, -4(r1) +; PPC64-NEXT: lfs f1, -4(r1) +; PPC64-NEXT: blr + %val = load atomic float, ptr %ptr seq_cst, align 4 + ret float %val +} + +define double @load_atomic_f64__seq_cst(ptr %ptr) { +; PPC32-LABEL: load_atomic_f64__seq_cst: +; PPC32: # %bb.0: +; PPC32-NEXT: mflr r0 +; PPC32-NEXT: stwu r1, -16(r1) +; PPC32-NEXT: stw r0, 20(r1) +; PPC32-NEXT: .cfi_def_cfa_offset 16 +; PPC32-NEXT: .cfi_offset lr, 4 +; PPC32-NEXT: li r4, 5 +; PPC32-NEXT: bl __atomic_load_8 +; PPC32-NEXT: stw r4, 12(r1) +; PPC32-NEXT: stw r3, 8(r1) +; PPC32-NEXT: lfd f1, 8(r1) +; PPC32-NEXT: lwz r0, 20(r1) +; PPC32-NEXT: addi r1, r1, 16 +; PPC32-NEXT: mtlr r0 +; PPC32-NEXT: blr +; +; PPC64-LABEL: load_atomic_f64__seq_cst: +; PPC64: # %bb.0: +; PPC64-NEXT: sync +; PPC64-NEXT: ld r3, 0(r3) +; PPC64-NEXT: cmpd cr7, r3, r3 +; PPC64-NEXT: bne- cr7, .+4 +; PPC64-NEXT: isync +; PPC64-NEXT: std r3, -8(r1) +; PPC64-NEXT: lfd f1, 
-8(r1) +; PPC64-NEXT: blr + %val = load atomic double, ptr %ptr seq_cst, align 8 + ret double %val +} + +define void @store_atomic_f16__seq_cst(ptr %ptr, half %val1) { +; PPC32-LABEL: store_atomic_f16__seq_cst: +; PPC32: # %bb.0: +; PPC32-NEXT: mflr r0 +; PPC32-NEXT: stwu r1, -16(r1) +; PPC32-NEXT: stw r0, 20(r1) +; PPC32-NEXT: .cfi_def_cfa_offset 16 +; PPC32-NEXT: .cfi_offset lr, 4 +; PPC32-NEXT: .cfi_offset r30, -8 +; PPC32-NEXT: stw r30, 8(r1) # 4-byte Folded Spill +; PPC32-NEXT: mr r30, r3 +; PPC32-NEXT: bl __gnu_f2h_ieee +; PPC32-NEXT: sync +; PPC32-NEXT: sth r3, 0(r30) +; PPC32-NEXT: lwz r30, 8(r1) # 4-byte Folded Reload +; PPC32-NEXT: lwz r0, 20(r1) +; PPC32-NEXT: addi r1, r1, 16 +; PPC32-NEXT: mtlr r0 +; PPC32-NEXT: blr +; +; PPC64-LABEL: store_atomic_f16__seq_cst: +; PPC64: # %bb.0: +; PPC64-NEXT: mflr r0 +; PPC64-NEXT: stdu r1, -128(r1) +; PPC64-NEXT: std r0, 144(r1) +; PPC64-NEXT: .cfi_def_cfa_offset 128 +; PPC64-NEXT: .cfi_offset lr, 16 +; PPC64-NEXT: .cfi_offset r30, -16 +; PPC64-NEXT: std r30, 112(r1) # 8-byte Folded Spill +; PPC64-NEXT: mr r30, r3 +; PPC64-NEXT: bl __gnu_f2h_ieee +; PPC64-NEXT: nop +; PPC64-NEXT: sync +; PPC64-NEXT: sth r3, 0(r30) +; PPC64-NEXT: ld r30, 112(r1) # 8-byte Folded Reload +; PPC64-NEXT: addi r1, r1, 128 +; PPC64-NEXT: ld r0, 16(r1) +; PPC64-NEXT: mtlr r0 +; PPC64-NEXT: blr + store atomic half %val1, ptr %ptr seq_cst, align 2 + ret void +} + +; FIXME: bf16_to_fp fails to select +; define void @store_atomic_bf16__seq_cst(ptr %ptr, bfloat %val1) { +; store atomic bfloat %val1, ptr %ptr seq_cst, align 2 +; ret void +; } + +define void @store_atomic_f32__seq_cst(ptr %ptr, float %val1) { +; PPC32-LABEL: store_atomic_f32__seq_cst: +; PPC32: # %bb.0: +; PPC32-NEXT: stwu r1, -16(r1) +; PPC32-NEXT: .cfi_def_cfa_offset 16 +; PPC32-NEXT: stfs f1, 12(r1) +; PPC32-NEXT: lwz r4, 12(r1) +; PPC32-NEXT: sync +; PPC32-NEXT: stw r4, 0(r3) +; PPC32-NEXT: addi r1, r1, 16 +; PPC32-NEXT: blr +; +; PPC64-LABEL: store_atomic_f32__seq_cst: +; 
PPC64: # %bb.0: +; PPC64-NEXT: stfs f1, -4(r1) +; PPC64-NEXT: lwz r4, -4(r1) +; PPC64-NEXT: sync +; PPC64-NEXT: stw r4, 0(r3) +; PPC64-NEXT: blr + store atomic float %val1, ptr %ptr seq_cst, align 4 + ret void +} + +define void @store_atomic_f64__seq_cst(ptr %ptr, double %val1) { +; PPC32-LABEL: store_atomic_f64__seq_cst: +; PPC32: # %bb.0: +; PPC32-NEXT: mflr r0 +; PPC32-NEXT: stwu r1, -16(r1) +; PPC32-NEXT: stw r0, 20(r1) +; PPC32-NEXT: .cfi_def_cfa_offset 16 +; PPC32-NEXT: .cfi_offset lr, 4 +; PPC32-NEXT: stfd f1, 8(r1) +; PPC32-NEXT: li r7, 5 +; PPC32-NEXT: lwz r5, 8(r1) +; PPC32-NEXT: lwz r6, 12(r1) +; PPC32-NEXT: bl __atomic_store_8 +; PPC32-NEXT: lwz r0, 20(r1) +; PPC32-NEXT: addi r1, r1, 16 +; PPC32-NEXT: mtlr r0 +; PPC32-NEXT: blr +; +; PPC64-LABEL: store_atomic_f64__seq_cst: +; PPC64: # %bb.0: +; PPC64-NEXT: stfd f1, -8(r1) +; PPC64-NEXT: ld r4, -8(r1) +; PPC64-NEXT: sync +; PPC64-NEXT: std r4, 0(r3) +; PPC64-NEXT: blr + store atomic double %val1, ptr %ptr seq_cst, align 8 + ret void +} diff --git a/llvm/test/CodeGen/X86/atomic-non-integer.ll b/llvm/test/CodeGen/X86/atomic-non-integer.ll index 9995e7d3a4d314..d7633cb11e44c1 100644 --- a/llvm/test/CodeGen/X86/atomic-non-integer.ll +++ b/llvm/test/CodeGen/X86/atomic-non-integer.ll @@ -787,3 +787,100 @@ define double @load_double_seq_cst(ptr %fptr) { %v = load atomic double, ptr %fptr seq_cst, align 8 ret double %v } + +define void @store_bfloat(ptr %fptr, bfloat %v) { +; X86-LABEL: store_bfloat: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movw %cx, (%eax) +; X86-NEXT: retl +; +; X64-SSE-LABEL: store_bfloat: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: pextrw $0, %xmm0, %eax +; X64-SSE-NEXT: movw %ax, (%rdi) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: store_bfloat: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vpextrw $0, %xmm0, %eax +; X64-AVX-NEXT: movw %ax, (%rdi) +; X64-AVX-NEXT: retq + store atomic bfloat %v, ptr %fptr unordered, align 2 + ret void 
+} + +; Work around issue #92899 by casting to float +define float @load_bfloat(ptr %fptr) { +; X86-SSE1-LABEL: load_bfloat: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl %eax +; X86-SSE1-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE1-NEXT: movzwl (%eax), %eax +; X86-SSE1-NEXT: shll $16, %eax +; X86-SSE1-NEXT: movl %eax, (%esp) +; X86-SSE1-NEXT: flds (%esp) +; X86-SSE1-NEXT: popl %eax +; X86-SSE1-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: load_bfloat: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %eax +; X86-SSE2-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movzwl (%eax), %eax +; X86-SSE2-NEXT: shll $16, %eax +; X86-SSE2-NEXT: movd %eax, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, (%esp) +; X86-SSE2-NEXT: flds (%esp) +; X86-SSE2-NEXT: popl %eax +; X86-SSE2-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE2-NEXT: retl +; +; X86-AVX-LABEL: load_bfloat: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: pushl %eax +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movzwl (%eax), %eax +; X86-AVX-NEXT: shll $16, %eax +; X86-AVX-NEXT: vmovd %eax, %xmm0 +; X86-AVX-NEXT: vmovd %xmm0, (%esp) +; X86-AVX-NEXT: flds (%esp) +; X86-AVX-NEXT: popl %eax +; X86-AVX-NEXT: .cfi_def_cfa_offset 4 +; X86-AVX-NEXT: retl +; +; X86-NOSSE-LABEL: load_bfloat: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl %eax +; X86-NOSSE-NEXT: .cfi_def_cfa_offset 8 +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: movzwl (%eax), %eax +; X86-NOSSE-NEXT: shll $16, %eax +; X86-NOSSE-NEXT: movl %eax, (%esp) +; X86-NOSSE-NEXT: flds (%esp) +; X86-NOSSE-NEXT: popl %eax +; X86-NOSSE-NEXT: .cfi_def_cfa_offset 4 +; X86-NOSSE-NEXT: retl +; +; X64-SSE-LABEL: load_bfloat: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movzwl (%rdi), %eax +; X64-SSE-NEXT: shll $16, %eax +; X64-SSE-NEXT: movd %eax, %xmm0 +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: load_bfloat: +; X64-AVX: # %bb.0: 
+; X64-AVX-NEXT: movzwl (%rdi), %eax +; X64-AVX-NEXT: shll $16, %eax +; X64-AVX-NEXT: vmovd %eax, %xmm0 +; X64-AVX-NEXT: retq + %v = load atomic bfloat, ptr %fptr unordered, align 2 + %ext = fpext bfloat %v to float + ret float %ext +} From 3613b2683107bd60fda6d9348623be0686f6d7e3 Mon Sep 17 00:00:00 2001 From: Matthew Devereau Date: Wed, 29 May 2024 06:13:02 +0100 Subject: [PATCH 059/230] Constant Fold logf128 calls (#90611) This is a second attempt to land #84501, which failed on several targets. This patch adds the HAS_IEE754_FLOAT128 define, which makes the check for typedef'ing float128 more precise by checking that __uint128_t is available and that the host does not use __ibm128, which is prevalent on PowerPC targets and replaces IEEE 754 float128s. --- llvm/CMakeLists.txt | 2 + llvm/cmake/config-ix.cmake | 11 ++ llvm/include/llvm/ADT/APFloat.h | 13 ++ llvm/include/llvm/ADT/APInt.h | 8 ++ llvm/include/llvm/Config/llvm-config.h.cmake | 3 + llvm/include/llvm/Support/float128.h | 26 ++++ llvm/lib/Analysis/CMakeLists.txt | 6 + llvm/lib/Analysis/ConstantFolding.cpp | 11 ++ llvm/lib/Support/APFloat.cpp | 24 ++++ llvm/test/CMakeLists.txt | 1 + .../InstSimplify/ConstProp/logf128.ll | 126 ++++++++++++++++++ llvm/test/lit.cfg.py | 3 + llvm/test/lit.site.cfg.py.in | 1 + 13 files changed, 235 insertions(+) create mode 100644 llvm/include/llvm/Support/float128.h create mode 100644 llvm/test/Transforms/InstSimplify/ConstProp/logf128.ll diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 612e90abd40913..64898ab09772f4 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -560,6 +560,8 @@ set(LLVM_USE_STATIC_ZSTD FALSE CACHE BOOL "Use static version of zstd. Can be TR set(LLVM_ENABLE_CURL "OFF" CACHE STRING "Use libcurl for the HTTP client if available. Can be ON, OFF, or FORCE_ON") +set(LLVM_HAS_LOGF128 "OFF" CACHE STRING "Use logf128 to constant fold fp128 logarithm calls. 
Can be ON, OFF, or FORCE_ON") + set(LLVM_ENABLE_HTTPLIB "OFF" CACHE STRING "Use cpp-httplib HTTP server library if available. Can be ON, OFF, or FORCE_ON") set(LLVM_Z3_INSTALL_DIR "" CACHE STRING "Install directory of the Z3 solver.") diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake index 8cfb36b0194e85..0aae13e30f2ab4 100644 --- a/llvm/cmake/config-ix.cmake +++ b/llvm/cmake/config-ix.cmake @@ -247,6 +247,17 @@ else() set(HAVE_LIBEDIT 0) endif() +if(LLVM_HAS_LOGF128) + include(CheckCXXSymbolExists) + check_cxx_symbol_exists(logf128 math.h HAS_LOGF128) + + if(LLVM_HAS_LOGF128 STREQUAL FORCE_ON AND NOT HAS_LOGF128) + message(FATAL_ERROR "Failed to configure logf128") + endif() + + set(LLVM_HAS_LOGF128 "${HAS_LOGF128}") +endif() + # function checks check_symbol_exists(arc4random "stdlib.h" HAVE_DECL_ARC4RANDOM) find_package(Backtrace) diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h index deb74cb2fdeb1e..44a301ecc99280 100644 --- a/llvm/include/llvm/ADT/APFloat.h +++ b/llvm/include/llvm/ADT/APFloat.h @@ -19,6 +19,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/FloatingPointMode.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/float128.h" #include #define APFLOAT_DISPATCH_ON_SEMANTICS(METHOD_CALL) \ @@ -354,6 +355,9 @@ class IEEEFloat final : public APFloatBase { Expected convertFromString(StringRef, roundingMode); APInt bitcastToAPInt() const; double convertToDouble() const; +#ifdef HAS_IEE754_FLOAT128 + float128 convertToQuad() const; +#endif float convertToFloat() const; /// @} @@ -1218,6 +1222,15 @@ class APFloat : public APFloatBase { /// shorter semantics, like IEEEsingle and others. double convertToDouble() const; + /// Converts this APFloat to host float128 value. + /// + /// \pre The APFloat must be built using semantics, that can be represented by + /// the host float128 type without loss of precision. It can be IEEEquad and + /// shorter semantics, like IEEEdouble and others. 
+#ifdef HAS_IEE754_FLOAT128 + float128 convertToQuad() const; +#endif + /// Converts this APFloat to host float value. /// /// \pre The APFloat must be built using semantics, that can be represented by diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h index 2fd8b7ea636c4a..6cfa6ec6650842 100644 --- a/llvm/include/llvm/ADT/APInt.h +++ b/llvm/include/llvm/ADT/APInt.h @@ -17,6 +17,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/float128.h" #include #include #include @@ -1677,6 +1678,13 @@ class [[nodiscard]] APInt { /// any bit width. Exactly 64 bits will be translated. double bitsToDouble() const { return llvm::bit_cast(getWord(0)); } +#ifdef HAS_IEE754_FLOAT128 + float128 bitsToQuad() const { + __uint128_t ul = ((__uint128_t)U.pVal[1] << 64) + U.pVal[0]; + return llvm::bit_cast(ul); + } +#endif + /// Converts APInt bits to a float /// /// The conversion does not do a translation from integer to float, it just diff --git a/llvm/include/llvm/Config/llvm-config.h.cmake b/llvm/include/llvm/Config/llvm-config.h.cmake index 6605ea60df99e1..629977cc11d683 100644 --- a/llvm/include/llvm/Config/llvm-config.h.cmake +++ b/llvm/include/llvm/Config/llvm-config.h.cmake @@ -198,4 +198,7 @@ /* Define if plugins enabled */ #cmakedefine LLVM_ENABLE_PLUGINS +/* Define if logf128 is available */ +#cmakedefine LLVM_HAS_LOGF128 + #endif diff --git a/llvm/include/llvm/Support/float128.h b/llvm/include/llvm/Support/float128.h new file mode 100644 index 00000000000000..e15a98dc5a6779 --- /dev/null +++ b/llvm/include/llvm/Support/float128.h @@ -0,0 +1,26 @@ +//===-- llvm/Support/float128.h - Compiler abstraction support --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_FLOAT128 +#define LLVM_FLOAT128 + +namespace llvm { + +#if defined(__clang__) && defined(__FLOAT128__) && \ + defined(__SIZEOF_INT128__) && !defined(__LONG_DOUBLE_IBM128__) +#define HAS_IEE754_FLOAT128 +typedef __float128 float128; +#elif defined(__FLOAT128__) && defined(__SIZEOF_INT128__) && \ + !defined(__LONG_DOUBLE_IBM128__) && \ + (defined(__GNUC__) || defined(__GNUG__)) +#define HAS_IEE754_FLOAT128 +typedef _Float128 float128; +#endif + +} // namespace llvm +#endif // LLVM_FLOAT128 diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt index 474b8d20fde16f..74476cb5440c61 100644 --- a/llvm/lib/Analysis/CMakeLists.txt +++ b/llvm/lib/Analysis/CMakeLists.txt @@ -159,3 +159,9 @@ add_llvm_component_library(LLVMAnalysis Support TargetParser ) + +include(CheckCXXSymbolExists) +check_cxx_symbol_exists(logf128 math.h HAS_LOGF128) +if(HAS_LOGF128) + target_compile_definitions(LLVMAnalysis PRIVATE HAS_LOGF128) +endif() diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 705377b97ed903..5febe917126b1d 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -2087,6 +2087,17 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, if (IntrinsicID == Intrinsic::canonicalize) return constantFoldCanonicalize(Ty, Call, U); +#if defined(HAS_IEE754_FLOAT128) && defined(HAS_LOGF128) + if (Ty->isFP128Ty()) { + switch (IntrinsicID) { + default: + return nullptr; + case Intrinsic::log: + return ConstantFP::get(Ty, logf128(Op->getValueAPF().convertToQuad())); + } + } +#endif + if (!Ty->isHalfTy() && !Ty->isFloatTy() && !Ty->isDoubleTy()) return nullptr; diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp index 2a9b3903720be1..283fcc153b33aa 100644 --- a/llvm/lib/Support/APFloat.cpp +++ 
b/llvm/lib/Support/APFloat.cpp @@ -3665,6 +3665,15 @@ double IEEEFloat::convertToDouble() const { return api.bitsToDouble(); } +#ifdef HAS_IEE754_FLOAT128 +float128 IEEEFloat::convertToQuad() const { + assert(semantics == (const llvm::fltSemantics *)&semIEEEquad && + "Float semantics are not IEEEquads"); + APInt api = bitcastToAPInt(); + return api.bitsToQuad(); +} +#endif + /// Integer bit is explicit in this format. Intel hardware (387 and later) /// does not support these bit patterns: /// exponent = all 1's, integer bit 0, significand 0 ("pseudoinfinity") @@ -5260,6 +5269,21 @@ double APFloat::convertToDouble() const { return Temp.getIEEE().convertToDouble(); } +#ifdef HAS_IEE754_FLOAT128 +float128 APFloat::convertToQuad() const { + if (&getSemantics() == (const llvm::fltSemantics *)&semIEEEquad) + return getIEEE().convertToQuad(); + assert(getSemantics().isRepresentableBy(semIEEEquad) && + "Float semantics is not representable by IEEEquad"); + APFloat Temp = *this; + bool LosesInfo; + opStatus St = Temp.convert(semIEEEquad, rmNearestTiesToEven, &LosesInfo); + assert(!(St & opInexact) && !LosesInfo && "Unexpected imprecision"); + (void)St; + return Temp.getIEEE().convertToQuad(); +} +#endif + float APFloat::convertToFloat() const { if (&getSemantics() == (const llvm::fltSemantics *)&semIEEEsingle) return getIEEE().convertToFloat(); diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt index c942339e43608e..2f466c258f6771 100644 --- a/llvm/test/CMakeLists.txt +++ b/llvm/test/CMakeLists.txt @@ -26,6 +26,7 @@ llvm_canonicalize_cmake_booleans( LLVM_TOOL_LLVM_DRIVER_BUILD LLVM_INCLUDE_SPIRV_TOOLS_TESTS LLVM_APPEND_VC_REV + LLVM_HAS_LOGF128 ) configure_lit_site_cfg( diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/logf128.ll b/llvm/test/Transforms/InstSimplify/ConstProp/logf128.ll new file mode 100644 index 00000000000000..da56997f693822 --- /dev/null +++ b/llvm/test/Transforms/InstSimplify/ConstProp/logf128.ll @@ -0,0 +1,126 @@ +; NOTE: Assertions 
have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=instsimplify -S | FileCheck %s + +; REQUIRES: has_logf128 +declare fp128 @llvm.log.f128(fp128) + +define fp128 @log_e_64(){ +; CHECK-LABEL: define fp128 @log_e_64() { +; CHECK-NEXT: ret fp128 0xL300000000000000040010A2B23F3BAB7 +; + %A = call fp128 @llvm.log.f128(fp128 noundef 0xL00000000000000004005000000000000) + ret fp128 %A +} + +define fp128 @log_e_smallest_positive_subnormal_number(){ +; CHECK-LABEL: define fp128 @log_e_smallest_positive_subnormal_number() { +; CHECK-NEXT: ret fp128 0xL3000000000000000C00C654628220780 +; + %A = call fp128 @llvm.log.f128(fp128 noundef 0xL00000000000000010000000000000000) + ret fp128 %A +} + +define fp128 @log_e_largest_subnormal_number(){ +; CHECK-LABEL: define fp128 @log_e_largest_subnormal_number() { +; CHECK-NEXT: ret fp128 0xLD000000000000000C00C62D918CE2421 +; + %A = call fp128 @llvm.log.f128(fp128 noundef 0xLFFFFFFFFFFFFFFFF0000FFFFFFFFFFFF) + ret fp128 %A +} + +define fp128 @log_e_smallest_positive_normal_number(){ +; +; CHECK-LABEL: define fp128 @log_e_smallest_positive_normal_number() { +; CHECK-NEXT: ret fp128 0xLD000000000000000C00C62D918CE2421 +; + %A = call fp128 @llvm.log.f128(fp128 noundef 0xL00000000000000000001000000000000) + ret fp128 %A +} + +define fp128 @log_e_largest_normal_number(){ +; CHECK-LABEL: define fp128 @log_e_largest_normal_number() { +; CHECK-NEXT: ret fp128 0xLF000000000000000400C62E42FEFA39E +; + %A = call fp128 @llvm.log.f128(fp128 noundef 0xLFFFFFFFFFFFFFFFF7FFEFFFFFFFFFFFF) + ret fp128 %A +} + +define fp128 @log_e_largest_number_less_than_one(){ +; CHECK-LABEL: define fp128 @log_e_largest_number_less_than_one() { +; CHECK-NEXT: ret fp128 0xL0000000000000000BF8E000000000000 +; + %A = call fp128 @llvm.log.f128(fp128 noundef 0xLFFFFFFFFFFFFFFFF3FFEFFFFFFFFFFFF) + ret fp128 %A +} + +define fp128 @log_e_1(){ +; CHECK-LABEL: define fp128 @log_e_1() { +; CHECK-NEXT: ret fp128 
0xL00000000000000000000000000000000 +; + %A = call fp128 @llvm.log.f128(fp128 noundef 0xL00000000000000003FFF000000000000) + ret fp128 %A +} + +define fp128 @log_e_smallest_number_larger_than_one(){ +; CHECK-LABEL: define fp128 @log_e_smallest_number_larger_than_one() { +; CHECK-NEXT: ret fp128 0xL00000000000000003F8F000000000000 +; + %A = call fp128 @llvm.log.f128(fp128 noundef 0xL00000000000000013FFF000000000000) + ret fp128 %A +} + +define fp128 @log_e_negative_2(){ +; CHECK-LABEL: define fp128 @log_e_negative_2() { +; CHECK-NEXT: ret fp128 0xL00000000000000007FFF800000000000 +; + %A = call fp128 @llvm.log.f128(fp128 noundef 0xL0000000000000000C000000000000000) + ret fp128 %A +} + +define fp128 @log_e_0(){ +; CHECK-LABEL: define fp128 @log_e_0() { +; CHECK-NEXT: ret fp128 0xL0000000000000000FFFF000000000000 +; + %A = call fp128 @llvm.log.f128(fp128 noundef 0xL00000000000000000000000000000000) + ret fp128 %A +} + +define fp128 @log_e_negative_0(){ +; CHECK-LABEL: define fp128 @log_e_negative_0() { +; CHECK-NEXT: ret fp128 0xL0000000000000000FFFF000000000000 +; + %A = call fp128 @llvm.log.f128(fp128 noundef 0xL00000000000000008000000000000000) + ret fp128 %A +} + +define fp128 @log_e_infinity(){ +; CHECK-LABEL: define fp128 @log_e_infinity() { +; CHECK-NEXT: ret fp128 0xL00000000000000007FFF000000000000 +; + %A = call fp128 @llvm.log.f128(fp128 noundef 0xL00000000000000007FFF000000000000) + ret fp128 %A +} + +define fp128 @log_e_negative_infinity(){ +; CHECK-LABEL: define fp128 @log_e_negative_infinity() { +; CHECK-NEXT: ret fp128 0xL00000000000000007FFF800000000000 +; + %A = call fp128 @llvm.log.f128(fp128 noundef 0xL0000000000000000FFFF000000000000) + ret fp128 %A +} + +define fp128 @log_e_nan(){ +; CHECK-LABEL: define fp128 @log_e_nan() { +; CHECK-NEXT: ret fp128 0xL00000000000000007FFF800000000001 +; + %A = call fp128 @llvm.log.f128(fp128 noundef 0xL00000000000000007FFF000000000001) + ret fp128 %A +} + +define <2 x fp128> @log_e_negative_2_vector(){ +; 
CHECK-LABEL: define <2 x fp128> @log_e_negative_2_vector() { +; CHECK-NEXT: ret <2 x fp128> +; + %A = call <2 x fp128> @llvm.log.v2f128(<2 x fp128> ) + ret <2 x fp128> %A +} diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index affd87b98c1410..fe1262893212fb 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -617,3 +617,6 @@ def have_ld64_plugin_support(): # "OBJECT_MODE" to 'any' by default on AIX OS. if "system-aix" in config.available_features: config.environment["OBJECT_MODE"] = "any" + +if config.has_logf128: + config.available_features.add("has_logf128") diff --git a/llvm/test/lit.site.cfg.py.in b/llvm/test/lit.site.cfg.py.in index 60a68b0edaf933..0968f6214772d0 100644 --- a/llvm/test/lit.site.cfg.py.in +++ b/llvm/test/lit.site.cfg.py.in @@ -63,6 +63,7 @@ config.have_llvm_driver = @LLVM_TOOL_LLVM_DRIVER_BUILD@ config.spirv_tools_tests = @LLVM_INCLUDE_SPIRV_TOOLS_TESTS@ config.have_vc_rev = @LLVM_APPEND_VC_REV@ config.force_vc_rev = "@LLVM_FORCE_VC_REVISION@" +config.has_logf128 = @LLVM_HAS_LOGF128@ import lit.llvm lit.llvm.initialize(lit_config, config) From b0f10a1dc34aa1b73faeeabdc2d348074a02c75d Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Wed, 29 May 2024 13:39:57 +0800 Subject: [PATCH 060/230] [C++20] [Modules] Don't generate the definition for non-const available external variables (#93530) Close https://github.com/llvm/llvm-project/issues/93497 The root cause of the problem is that we incorrectly mark variables from other modules as constant in LLVM. This patch fixes the problem by not emitting the definition for non-const available external variables, since a non-const available-externally variable is not helpful to the optimization.
--- clang/lib/CodeGen/CodeGenModule.cpp | 12 +++ clang/test/CodeGenCXX/partitions.cpp | 8 +- clang/test/Modules/pr93497.cppm | 106 +++++++++++++++++++++++++++ 3 files changed, 122 insertions(+), 4 deletions(-) create mode 100644 clang/test/Modules/pr93497.cppm diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index e4774a587707ac..0b0b659e1fd490 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -5341,6 +5341,18 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D, !IsDefinitionAvailableExternally && D->needsDestruction(getContext()) == QualType::DK_cxx_destructor; + // It is pointless to emit the definition for an available_externally variable + // which can't be marked as const. + // We don't need to check if it needs global ctor or dtor. See the above + // comment for ideas. + if (IsDefinitionAvailableExternally && + (!D->hasConstantInitialization() || + // TODO: Update this when we have interface to check constexpr + // destructor. + D->needsDestruction(getContext()) || + !D->getType().isConstantStorage(getContext(), true, true))) + return; + const VarDecl *InitDecl; const Expr *InitExpr = D->getAnyInitializer(InitDecl); diff --git a/clang/test/CodeGenCXX/partitions.cpp b/clang/test/CodeGenCXX/partitions.cpp index d283dd071f6b28..e80e68f82974bd 100644 --- a/clang/test/CodeGenCXX/partitions.cpp +++ b/clang/test/CodeGenCXX/partitions.cpp @@ -40,12 +40,12 @@ export int use() { } // FIXME: The definition of the variables shouldn't be exported too.
-// CHECK: @_ZW3mod1a = available_externally global -// CHECK: @_ZW3mod1b = available_externally global +// CHECK: @_ZW3mod1a = external global +// CHECK: @_ZW3mod1b = external global // CHECK: declare{{.*}} i32 @_ZW3mod3foov // CHECK: declare{{.*}} i32 @_ZW3mod3barv -// CHECK-OPT: @_ZW3mod1a = available_externally global -// CHECK-OPT: @_ZW3mod1b = available_externally global +// CHECK-OPT: @_ZW3mod1a = external global +// CHECK-OPT: @_ZW3mod1b = external global // CHECK-OPT: declare{{.*}} i32 @_ZW3mod3foov // CHECK-OPT: declare{{.*}} i32 @_ZW3mod3barv diff --git a/clang/test/Modules/pr93497.cppm b/clang/test/Modules/pr93497.cppm new file mode 100644 index 00000000000000..64a08e2a85e63e --- /dev/null +++ b/clang/test/Modules/pr93497.cppm @@ -0,0 +1,106 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t + +// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %t/mod.cppm \ +// RUN: -emit-module-interface -o %t/mod.pcm +// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %t/use.cpp \ +// RUN: -fmodule-file=mod=%t/mod.pcm -emit-llvm \ +// RUN: -o - | opt -S --passes=simplifycfg | FileCheck %t/use.cpp + +//--- mod.cppm +export module mod; + +export struct Thing { + static const Thing One; + explicit Thing(int raw) :raw(raw) { } + int raw; +}; + +const Thing Thing::One = Thing(1); + +export struct C { + int value; +}; +export const C ConstantValue = {1}; + +export const C *ConstantPtr = &ConstantValue; + +C NonConstantValue = {1}; +export const C &ConstantRef = NonConstantValue; + +export struct NonConstexprDtor { + constexpr NonConstexprDtor(int raw) : raw(raw) {} + ~NonConstexprDtor(); + + int raw; +}; + +export const NonConstexprDtor NonConstexprDtorValue = {1}; + +//--- use.cpp +import mod; + +int consume(int); +int consumeC(C); + +extern "C" __attribute__((noinline)) inline int unneeded() { + return consume(43); +} + +extern "C" __attribute__((noinline)) inline int needed() { + return consume(43); +} + +int use() { + Thing t1 = 
Thing::One; + return consume(t1.raw); +} + +int use2() { + if (ConstantValue.value) + return consumeC(ConstantValue); + return unneeded(); +} + +int use3() { + auto Ptr = ConstantPtr; + if (Ptr->value) + return consumeC(*Ptr); + return needed(); +} + +int use4() { + auto Ref = ConstantRef; + if (Ref.value) + return consumeC(Ref); + return needed(); +} + +int use5() { + NonConstexprDtor V = NonConstexprDtorValue; + if (V.raw) + return consume(V.raw); + return needed(); +} + +// CHECK: @_ZNW3mod5Thing3OneE = external +// CHECK: @_ZW3mod13ConstantValue ={{.*}}available_externally{{.*}} constant +// CHECK: @_ZW3mod11ConstantPtr = external +// CHECK: @_ZW3mod16NonConstantValue = external +// CHECK: @_ZW3mod21NonConstexprDtorValue = external + +// Check that the middle end can optimize the program by the constant information. +// CHECK-NOT: @unneeded( + +// Check that the use of ConstantPtr won't get optimized incorrectly. +// CHECK-LABEL: @_Z4use3v( +// CHECK: @needed( + +// Check that the use of ConstantRef won't get optimized incorrectly. +// CHECK-LABEL: @_Z4use4v( +// CHECK: @needed( + +// Check that the use of NonConstexprDtorValue won't get optimized incorrectly. +// CHECK-LABEL: @_Z4use5v( +// CHECK: @needed( From 70d6b8a358366ec2ef4e73d5809fe23b9abf527d Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 28 May 2024 22:50:21 -0700 Subject: [PATCH 061/230] MCAsmParser: Amend \+ expansion Amend 7c956293d856224dd6a1b633820ef53009f7ef1c ("MCAsmParser: Support \+") to increase Macro.Count per iteration to match the new gas feature (milestone: 2.43). 
--- llvm/lib/MC/MCParser/AsmParser.cpp | 3 ++- llvm/test/MC/AsmParser/macro-at-pseudo-variable.s | 12 ++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index 2cddaf330b3bc5..8014ef9d039487 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -2580,7 +2580,7 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, MCAsmMacro &Macro, OS << NumOfMacroInstantiations; Pos += 2; } else if (Argument == "+") { - OS << Macro.Count++; + OS << Macro.Count; Pos += 2; } else { for (; Index < NParameters; ++Index) @@ -2629,6 +2629,7 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, MCAsmMacro &Macro, Body = Body.substr(Pos); } + ++Macro.Count; return false; } diff --git a/llvm/test/MC/AsmParser/macro-at-pseudo-variable.s b/llvm/test/MC/AsmParser/macro-at-pseudo-variable.s index a083b17aa54fe4..e1bb2298042096 100644 --- a/llvm/test/MC/AsmParser/macro-at-pseudo-variable.s +++ b/llvm/test/MC/AsmParser/macro-at-pseudo-variable.s @@ -74,15 +74,15 @@ #--- b.s .rept 2 - .print "r\+" + .print "r\+ \+" .endr .irpc foo,12 - .print "\+i" + .print "\+\+i" .endr -# CHECK2: r0 -# CHECK2-NEXT: r1 -# CHECK2-NEXT: 0i -# CHECK2-NEXT: 1i +# CHECK2: r0 0 +# CHECK2-NEXT: r1 1 +# CHECK2-NEXT: 00i +# CHECK2-NEXT: 11i .rept 2 .rept 2 From 5162027c9bb32ddbc8b37770c569a3e5a877d962 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 28 May 2024 22:47:19 -0700 Subject: [PATCH 062/230] [RISCV] Add test for #93578. NFC --- llvm/test/CodeGen/RISCV/rvv/vsra-sdnode.ll | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/rvv/vsra-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vsra-sdnode.ll index 7bae84142d8ae6..8dbb57fd15cf16 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsra-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsra-sdnode.ll @@ -934,3 +934,22 @@ define @vsra_vi_mask_nxv8i32( %va, %va, %vs ret %vc } + +; Negative test. 
We shouldn't look through the vp.trunc as it isn't vlmax like +; the rest of the code. +define @vsra_vv_nxv1i8_sext_zext_mixed_trunc( %va, %vb, %m, i32 %evl) { +; CHECK-LABEL: vsra_vv_nxv1i8_sext_zext_mixed_trunc: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 7 +; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma +; CHECK-NEXT: vmin.vx v9, v8, a0 +; CHECK-NEXT: vsra.vv v8, v8, v9 +; CHECK-NEXT: ret + %sexted_va = sext %va to + %zexted_vb = zext %va to + %expand = ashr %sexted_va, %zexted_vb + %vc = trunc %expand to + %vd = call @llvm.vp.trunc.nxv1i8.nxvi16( %vc, %m, i32 %evl) + ret %vd +} +declare @llvm.vp.trunc.nxv1i8.nxvi16(, , i32) From 4e0bd3fab4b6a54342c9bed14f205895da3cf0d9 Mon Sep 17 00:00:00 2001 From: Pengcheng Wang Date: Wed, 29 May 2024 14:10:01 +0800 Subject: [PATCH 063/230] [MachineLICM] Hoist copies of constant physical register (#93285) Previously, we just check if the source is a virtual register and this prevents some potential hoists. We can see some improvements in AArch64/RISCV tests. 
--- llvm/lib/CodeGen/MachineLICM.cpp | 5 +- .../AArch64/atomicrmw-uinc-udec-wrap.ll | 10 +- .../AArch64/dag-combine-concat-vectors.ll | 66 ++++---- .../machine-sink-cache-invalidation.ll | 10 +- .../AArch64/ragreedy-local-interval-cost.ll | 148 +++++++++--------- llvm/test/CodeGen/AMDGPU/amdpal-callable.ll | 7 +- llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll | 16 +- llvm/test/CodeGen/AVR/shift.ll | 6 +- .../RISCV/machinelicm-constant-phys-reg.ll | 41 +++++ .../RISCV/rvv/65704-illegal-instruction.ll | 21 +-- .../RISCV/rvv/fold-scalar-load-crash.ll | 48 +++--- llvm/test/CodeGen/RISCV/vlenb.ll | 5 +- 12 files changed, 218 insertions(+), 165 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/machinelicm-constant-phys-reg.ll diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index 727a98c41bce4c..86eb259c090152 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -1269,8 +1269,9 @@ bool MachineLICMBase::IsProfitableToHoist(MachineInstr &MI, Register DefReg = MI.getOperand(0).getReg(); if (DefReg.isVirtual() && all_of(MI.uses(), - [](const MachineOperand &UseOp) { - return !UseOp.isReg() || UseOp.getReg().isVirtual(); + [this](const MachineOperand &UseOp) { + return !UseOp.isReg() || UseOp.getReg().isVirtual() || + MRI->isConstantPhysReg(UseOp.getReg()); }) && IsLoopInvariantInst(MI, CurLoop) && any_of(MRI->use_nodbg_instructions(DefReg), diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/AArch64/atomicrmw-uinc-udec-wrap.ll index 5f293e5c7ea34f..66fea3535b1ec3 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-uinc-udec-wrap.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-uinc-udec-wrap.ll @@ -55,15 +55,15 @@ define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) { define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) { ; CHECK-LABEL: atomicrmw_uinc_wrap_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, x0 ; CHECK-NEXT: .LBB3_1: // %atomicrmw.start ; CHECK-NEXT: // =>This 
Inner Loop Header: Depth=1 -; CHECK-NEXT: ldaxr x0, [x8] -; CHECK-NEXT: cmp x0, x1 -; CHECK-NEXT: csinc x9, xzr, x0, hs -; CHECK-NEXT: stlxr w10, x9, [x8] +; CHECK-NEXT: ldaxr x8, [x0] +; CHECK-NEXT: cmp x8, x1 +; CHECK-NEXT: csinc x9, xzr, x8, hs +; CHECK-NEXT: stlxr w10, x9, [x0] ; CHECK-NEXT: cbnz w10, .LBB3_1 ; CHECK-NEXT: // %bb.2: // %atomicrmw.end +; CHECK-NEXT: mov x0, x8 ; CHECK-NEXT: ret %result = atomicrmw uinc_wrap ptr %ptr, i64 %val seq_cst ret i64 %result diff --git a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll index 83c7f73800af19..dfe0e83649e203 100644 --- a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll +++ b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll @@ -8,57 +8,57 @@ declare void @llvm.masked.scatter.nxv16i8.nxv16p0(, This Inner Loop Header: Depth=1 -; CHECK-NEXT: fmov d17, xzr -; CHECK-NEXT: cmpeq p2.d, p0/z, z17.d, #0 -; CHECK-NEXT: uzp1 p2.s, p2.s, p0.s -; CHECK-NEXT: uzp1 p2.h, p2.h, p0.h -; CHECK-NEXT: uzp1 p2.b, p2.b, p0.b -; CHECK-NEXT: mov z17.b, p2/z, #1 // =0x1 -; CHECK-NEXT: fmov w8, s17 -; CHECK-NEXT: sbfx x8, x8, #0, #1 -; CHECK-NEXT: whilelo p2.b, xzr, x8 -; CHECK-NEXT: not p2.b, p1/z, p2.b -; CHECK-NEXT: punpklo p3.h, p2.b -; CHECK-NEXT: punpkhi p2.h, p2.b -; CHECK-NEXT: punpklo p4.h, p3.b -; CHECK-NEXT: punpkhi p3.h, p3.b -; CHECK-NEXT: punpklo p5.h, p4.b -; CHECK-NEXT: punpkhi p4.h, p4.b -; CHECK-NEXT: st1b { z0.d }, p5, [z16.d] -; CHECK-NEXT: st1b { z1.d }, p4, [z16.d] -; CHECK-NEXT: punpklo p4.h, p3.b +; CHECK-NEXT: punpklo p0.h, p2.b +; CHECK-NEXT: punpkhi p1.h, p2.b +; CHECK-NEXT: punpklo p2.h, p3.b ; CHECK-NEXT: punpkhi p3.h, p3.b -; CHECK-NEXT: st1b { z2.d }, p4, [z16.d] +; CHECK-NEXT: punpklo p4.h, p5.b +; CHECK-NEXT: punpkhi p5.h, p5.b +; CHECK-NEXT: punpklo p6.h, p7.b +; CHECK-NEXT: punpkhi p7.h, p7.b +; CHECK-NEXT: .LBB0_1: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: st1b { z0.d }, p0, [z16.d] +; CHECK-NEXT: st1b { z1.d 
}, p1, [z16.d] +; CHECK-NEXT: st1b { z2.d }, p2, [z16.d] ; CHECK-NEXT: st1b { z3.d }, p3, [z16.d] -; CHECK-NEXT: punpklo p3.h, p2.b -; CHECK-NEXT: punpkhi p2.h, p2.b -; CHECK-NEXT: punpklo p4.h, p3.b -; CHECK-NEXT: punpkhi p3.h, p3.b ; CHECK-NEXT: st1b { z4.d }, p4, [z16.d] -; CHECK-NEXT: st1b { z5.d }, p3, [z16.d] -; CHECK-NEXT: punpklo p3.h, p2.b -; CHECK-NEXT: punpkhi p2.h, p2.b -; CHECK-NEXT: st1b { z6.d }, p3, [z16.d] -; CHECK-NEXT: st1b { z7.d }, p2, [z16.d] +; CHECK-NEXT: st1b { z5.d }, p5, [z16.d] +; CHECK-NEXT: st1b { z6.d }, p6, [z16.d] +; CHECK-NEXT: st1b { z7.d }, p7, [z16.d] ; CHECK-NEXT: b .LBB0_1 br label %1 diff --git a/llvm/test/CodeGen/AArch64/machine-sink-cache-invalidation.ll b/llvm/test/CodeGen/AArch64/machine-sink-cache-invalidation.ll index 6effc63ecc13ce..fe3715341a25b8 100644 --- a/llvm/test/CodeGen/AArch64/machine-sink-cache-invalidation.ll +++ b/llvm/test/CodeGen/AArch64/machine-sink-cache-invalidation.ll @@ -9,20 +9,20 @@ target triple = "arm64-apple-macosx13.5.0" define i32 @nsis_BZ2_bzDecompress(ptr %pos.i, i1 %cmp661.not3117.i, i1 %exitcond.not.i) { ; CHECK-LABEL: nsis_BZ2_bzDecompress: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: b .LBB0_2 ; CHECK-NEXT: .LBB0_1: // %while.end671.i ; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: strb w8, [x0] +; CHECK-NEXT: strb w9, [x0] ; CHECK-NEXT: tbnz w2, #0, .LBB0_4 ; CHECK-NEXT: .LBB0_2: // %for.body653.i ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrb w8, [x0] +; CHECK-NEXT: ldrb w9, [x0] ; CHECK-NEXT: tbnz w1, #0, .LBB0_1 ; CHECK-NEXT: // %bb.3: // %while.body663.i ; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: mov x9, xzr -; CHECK-NEXT: ldrb w9, [x9] -; CHECK-NEXT: strb wzr, [x0, x9] +; CHECK-NEXT: ldrb w10, [x8] +; CHECK-NEXT: strb wzr, [x0, x10] ; CHECK-NEXT: b .LBB0_1 ; CHECK-NEXT: .LBB0_4: // %for.end677.i ; CHECK-NEXT: mov w0, wzr diff --git a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll 
b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll index 866b27b81d885f..c91de8f3a0a471 100644 --- a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll +++ b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll @@ -8,36 +8,39 @@ define dso_local void @run_test() local_unnamed_addr uwtable { ; CHECK-LABEL: run_test: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #192 -; CHECK-NEXT: .cfi_def_cfa_offset 192 +; CHECK-NEXT: sub sp, sp, #208 +; CHECK-NEXT: .cfi_def_cfa_offset 208 ; CHECK-NEXT: stp d15, d14, [sp, #96] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #112] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #128] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #144] // 16-byte Folded Spill -; CHECK-NEXT: stp x22, x21, [sp, #160] // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #176] // 16-byte Folded Spill +; CHECK-NEXT: str x23, [sp, #160] // 8-byte Folded Spill +; CHECK-NEXT: stp x22, x21, [sp, #176] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #192] // 16-byte Folded Spill ; CHECK-NEXT: .cfi_offset w19, -8 ; CHECK-NEXT: .cfi_offset w20, -16 ; CHECK-NEXT: .cfi_offset w21, -24 ; CHECK-NEXT: .cfi_offset w22, -32 -; CHECK-NEXT: .cfi_offset b8, -40 -; CHECK-NEXT: .cfi_offset b9, -48 -; CHECK-NEXT: .cfi_offset b10, -56 -; CHECK-NEXT: .cfi_offset b11, -64 -; CHECK-NEXT: .cfi_offset b12, -72 -; CHECK-NEXT: .cfi_offset b13, -80 -; CHECK-NEXT: .cfi_offset b14, -88 -; CHECK-NEXT: .cfi_offset b15, -96 +; CHECK-NEXT: .cfi_offset w23, -48 +; CHECK-NEXT: .cfi_offset b8, -56 +; CHECK-NEXT: .cfi_offset b9, -64 +; CHECK-NEXT: .cfi_offset b10, -72 +; CHECK-NEXT: .cfi_offset b11, -80 +; CHECK-NEXT: .cfi_offset b12, -88 +; CHECK-NEXT: .cfi_offset b13, -96 +; CHECK-NEXT: .cfi_offset b14, -104 +; CHECK-NEXT: .cfi_offset b15, -112 ; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: // implicit-def: $q1 ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: mov x9, xzr -; CHECK-NEXT: adrp x10, B+48 -; CHECK-NEXT: 
add x10, x10, :lo12:B+48 -; CHECK-NEXT: adrp x11, A -; CHECK-NEXT: add x11, x11, :lo12:A +; CHECK-NEXT: adrp x9, B+48 +; CHECK-NEXT: add x9, x9, :lo12:B+48 +; CHECK-NEXT: adrp x10, A +; CHECK-NEXT: add x10, x10, :lo12:A +; CHECK-NEXT: mov x11, xzr ; CHECK-NEXT: // kill: killed $q1 ; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: mov x12, xzr ; CHECK-NEXT: // implicit-def: $q0 ; CHECK-NEXT: // implicit-def: $q3 ; CHECK-NEXT: // implicit-def: $q4 @@ -69,103 +72,102 @@ define dso_local void @run_test() local_unnamed_addr uwtable { ; CHECK-NEXT: // kill: killed $q1 ; CHECK-NEXT: .LBB0_1: // %for.cond1.preheader ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: str q14, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: ldr q14, [x8] -; CHECK-NEXT: mov x12, xzr -; CHECK-NEXT: ldr x14, [x12] ; CHECK-NEXT: stp q29, q15, [sp] // 32-byte Folded Spill -; CHECK-NEXT: add x19, x11, x8 -; CHECK-NEXT: fmov x15, d14 -; CHECK-NEXT: mov x16, v14.d[1] -; CHECK-NEXT: ldr q15, [x12] -; CHECK-NEXT: ldr q14, [x10], #64 +; CHECK-NEXT: ldr q15, [x8] +; CHECK-NEXT: ldr x15, [x8] +; CHECK-NEXT: str q14, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: add x20, x10, x11 ; CHECK-NEXT: mov v8.16b, v28.16b -; CHECK-NEXT: fmov x13, d15 -; CHECK-NEXT: mov x18, v15.d[1] +; CHECK-NEXT: fmov x2, d15 +; CHECK-NEXT: mov x17, v15.d[1] +; CHECK-NEXT: ldr q14, [x8] ; CHECK-NEXT: mov v28.16b, v24.16b -; CHECK-NEXT: mul x17, x15, x14 -; CHECK-NEXT: mov x12, v14.d[1] -; CHECK-NEXT: fmov x4, d14 ; CHECK-NEXT: mov v24.16b, v20.16b ; CHECK-NEXT: mov v20.16b, v17.16b +; CHECK-NEXT: fmov x13, d14 +; CHECK-NEXT: mov x16, v14.d[1] ; CHECK-NEXT: mov v17.16b, v5.16b -; CHECK-NEXT: mul x1, x16, x14 +; CHECK-NEXT: mul x3, x2, x15 +; CHECK-NEXT: ldr q14, [x9], #64 ; CHECK-NEXT: ldr q5, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x5, [x8] -; CHECK-NEXT: ldr x19, [x19, #128] +; CHECK-NEXT: ldr x6, [x8] +; CHECK-NEXT: ldr x20, [x20, #128] +; CHECK-NEXT: mul x1, x17, x15 +; CHECK-NEXT: mov x14, 
v14.d[1] +; CHECK-NEXT: fmov x5, d14 ; CHECK-NEXT: mov v29.16b, v21.16b ; CHECK-NEXT: mov v21.16b, v0.16b -; CHECK-NEXT: mul x0, x13, x14 ; CHECK-NEXT: mov v25.16b, v6.16b +; CHECK-NEXT: mul x18, x13, x15 ; CHECK-NEXT: mov v6.16b, v2.16b -; CHECK-NEXT: fmov d15, x17 ; CHECK-NEXT: mov v26.16b, v22.16b +; CHECK-NEXT: fmov d15, x3 ; CHECK-NEXT: mov v22.16b, v18.16b -; CHECK-NEXT: mul x2, x18, x14 ; CHECK-NEXT: mov v18.16b, v7.16b +; CHECK-NEXT: mul x0, x16, x15 ; CHECK-NEXT: mov v7.16b, v3.16b ; CHECK-NEXT: mov v16.16b, v4.16b -; CHECK-NEXT: add x8, x8, #8 -; CHECK-NEXT: add x9, x9, #1 +; CHECK-NEXT: add x11, x11, #8 +; CHECK-NEXT: add x12, x12, #1 ; CHECK-NEXT: mov v15.d[1], x1 -; CHECK-NEXT: mul x3, x12, x14 -; CHECK-NEXT: cmp x8, #64 -; CHECK-NEXT: fmov d14, x0 -; CHECK-NEXT: mul x14, x4, x14 +; CHECK-NEXT: mul x4, x14, x15 +; CHECK-NEXT: cmp x11, #64 +; CHECK-NEXT: fmov d14, x18 +; CHECK-NEXT: mul x15, x5, x15 ; CHECK-NEXT: add v5.2d, v5.2d, v15.2d -; CHECK-NEXT: mul x20, x15, x5 -; CHECK-NEXT: mov v14.d[1], x2 -; CHECK-NEXT: mul x15, x15, x19 -; CHECK-NEXT: fmov d0, x14 +; CHECK-NEXT: mul x21, x2, x6 +; CHECK-NEXT: mov v14.d[1], x0 +; CHECK-NEXT: mul x2, x2, x20 +; CHECK-NEXT: fmov d0, x15 ; CHECK-NEXT: str q5, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: ldr q5, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: mul x21, x13, x19 +; CHECK-NEXT: mul x22, x13, x20 ; CHECK-NEXT: add v5.2d, v5.2d, v14.2d -; CHECK-NEXT: fmov d3, x20 -; CHECK-NEXT: mul x7, x16, x5 -; CHECK-NEXT: mov v0.d[1], x3 -; CHECK-NEXT: fmov d1, x15 -; CHECK-NEXT: mul x16, x16, x19 +; CHECK-NEXT: fmov d3, x21 +; CHECK-NEXT: mul x19, x17, x6 +; CHECK-NEXT: mov v0.d[1], x4 +; CHECK-NEXT: fmov d1, x2 +; CHECK-NEXT: mul x17, x17, x20 ; CHECK-NEXT: str q5, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: add v5.2d, v13.2d, v14.2d -; CHECK-NEXT: fmov d2, x21 +; CHECK-NEXT: fmov d2, x22 ; CHECK-NEXT: ldr q13, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: mul x6, x18, x5 +; CHECK-NEXT: mul x7, x16, 
x6 ; CHECK-NEXT: ldp q15, q14, [sp, #16] // 32-byte Folded Reload -; CHECK-NEXT: mov v3.d[1], x7 +; CHECK-NEXT: mov v3.d[1], x19 ; CHECK-NEXT: add v13.2d, v13.2d, v0.2d -; CHECK-NEXT: mul x18, x18, x19 -; CHECK-NEXT: mov v1.d[1], x16 -; CHECK-NEXT: mul x22, x4, x19 +; CHECK-NEXT: mul x16, x16, x20 +; CHECK-NEXT: mov v1.d[1], x17 +; CHECK-NEXT: mul x23, x5, x20 ; CHECK-NEXT: str q13, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: mov v13.16b, v5.16b ; CHECK-NEXT: mov v5.16b, v17.16b ; CHECK-NEXT: mov v17.16b, v20.16b ; CHECK-NEXT: mov v20.16b, v24.16b -; CHECK-NEXT: mul x13, x13, x5 +; CHECK-NEXT: mul x13, x13, x6 ; CHECK-NEXT: mov v24.16b, v28.16b ; CHECK-NEXT: add v11.2d, v11.2d, v3.2d -; CHECK-NEXT: mov v2.d[1], x18 +; CHECK-NEXT: mov v2.d[1], x16 ; CHECK-NEXT: add v15.2d, v15.2d, v1.2d ; CHECK-NEXT: add v27.2d, v27.2d, v3.2d -; CHECK-NEXT: mul x17, x12, x19 +; CHECK-NEXT: mul x18, x14, x20 ; CHECK-NEXT: add v23.2d, v23.2d, v3.2d ; CHECK-NEXT: add v19.2d, v19.2d, v3.2d -; CHECK-NEXT: fmov d4, x22 +; CHECK-NEXT: fmov d4, x23 ; CHECK-NEXT: add v10.2d, v10.2d, v3.2d -; CHECK-NEXT: mul x14, x4, x5 +; CHECK-NEXT: mul x15, x5, x6 ; CHECK-NEXT: fmov d0, x13 ; CHECK-NEXT: add v14.2d, v14.2d, v2.2d ; CHECK-NEXT: add v2.2d, v6.2d, v3.2d -; CHECK-NEXT: mul x12, x12, x5 +; CHECK-NEXT: mul x14, x14, x6 ; CHECK-NEXT: mov v3.16b, v7.16b ; CHECK-NEXT: mov v7.16b, v18.16b -; CHECK-NEXT: mov v4.d[1], x17 +; CHECK-NEXT: mov v4.d[1], x18 ; CHECK-NEXT: mov v18.16b, v22.16b -; CHECK-NEXT: mov v0.d[1], x6 -; CHECK-NEXT: fmov d1, x14 +; CHECK-NEXT: mov v0.d[1], x7 +; CHECK-NEXT: fmov d1, x15 ; CHECK-NEXT: add v28.2d, v8.2d, v4.2d -; CHECK-NEXT: mov v1.d[1], x12 +; CHECK-NEXT: mov v1.d[1], x14 ; CHECK-NEXT: add v31.2d, v31.2d, v0.2d ; CHECK-NEXT: add v30.2d, v30.2d, v0.2d ; CHECK-NEXT: add v12.2d, v12.2d, v0.2d @@ -192,11 +194,12 @@ define dso_local void @run_test() local_unnamed_addr uwtable { ; CHECK-NEXT: adrp x8, C ; CHECK-NEXT: add x8, x8, :lo12:C ; CHECK-NEXT: stp q11, q30, [x8, 
#80] -; CHECK-NEXT: ldp x20, x19, [sp, #176] // 16-byte Folded Reload +; CHECK-NEXT: ldp x20, x19, [sp, #192] // 16-byte Folded Reload ; CHECK-NEXT: str q1, [x8] ; CHECK-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x23, [sp, #160] // 8-byte Folded Reload ; CHECK-NEXT: stp q15, q14, [x8, #144] -; CHECK-NEXT: ldp x22, x21, [sp, #160] // 16-byte Folded Reload +; CHECK-NEXT: ldp x22, x21, [sp, #176] // 16-byte Folded Reload ; CHECK-NEXT: stp q1, q13, [x8, #16] ; CHECK-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: stp q28, q12, [x8, #176] @@ -216,12 +219,13 @@ define dso_local void @run_test() local_unnamed_addr uwtable { ; CHECK-NEXT: stp q5, q4, [x8, #432] ; CHECK-NEXT: stp q2, q3, [x8, #464] ; CHECK-NEXT: str q0, [x8, #496] -; CHECK-NEXT: add sp, sp, #192 +; CHECK-NEXT: add sp, sp, #208 ; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: .cfi_restore w19 ; CHECK-NEXT: .cfi_restore w20 ; CHECK-NEXT: .cfi_restore w21 ; CHECK-NEXT: .cfi_restore w22 +; CHECK-NEXT: .cfi_restore w23 ; CHECK-NEXT: .cfi_restore b8 ; CHECK-NEXT: .cfi_restore b9 ; CHECK-NEXT: .cfi_restore b10 diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll index 9d4f9434aa3146..1a0fda3d54d3f4 100644 --- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll @@ -142,7 +142,8 @@ attributes #0 = { nounwind } ; GCN: amdpal.pipelines: ; GCN-NEXT: - .registers: -; GCN-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}} +; SDAG-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}} +; GISEL-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01cb{{$}} ; GCN-NEXT: '0x2e13 (COMPUTE_PGM_RSRC2)': 0x8001{{$}} ; GCN-NEXT: .shader_functions: ; GCN-NEXT: dynamic_stack: @@ -156,10 +157,10 @@ attributes #0 = { nounwind } ; GCN-NEXT: .backend_stack_size: 0x10{{$}} ; GCN-NEXT: .lds_size: 0{{$}} ; SDAG-NEXT: .sgpr_count: 0x25{{$}} -; GISEL-NEXT: .sgpr_count: 0x26{{$}} +; GISEL-NEXT: .sgpr_count: 0x27{{$}} ; GCN-NEXT: 
.stack_frame_size_in_bytes: 0x10{{$}} ; SDAG-NEXT: .vgpr_count: 0x3{{$}} -; GISEL-NEXT: .vgpr_count: 0x4{{$}} +; GISEL-NEXT: .vgpr_count: 0x5{{$}} ; GCN-NEXT: multiple_stack: ; GCN-NEXT: .backend_stack_size: 0x24{{$}} ; GCN-NEXT: .lds_size: 0{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll index bfc249e9081d22..340f0cdd5d5d07 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -245,6 +245,7 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: {{ $}} ; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[PHI2]], %subreg.sub0, killed [[PHI3]], %subreg.sub1 ; SI-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; SI-NEXT: [[COPY6:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.3: ; SI-NEXT: successors: %bb.4(0x80000000) @@ -261,8 +262,7 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) ; SI-NEXT: {{ $}} ; SI-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; SI-NEXT: [[COPY6:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 - ; SI-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY6]] + ; SI-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY6]] ; SI-NEXT: $vgpr0 = COPY killed [[PHI5]] ; SI-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE1]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0 ; SI-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 @@ -282,6 +282,7 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: {{ $}} ; SI-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[COPY1]], %subreg.sub0, killed [[COPY]], %subreg.sub1 ; 
SI-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; SI-NEXT: [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.7: ; SI-NEXT: successors: %bb.8(0x80000000) @@ -298,8 +299,7 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: successors: %bb.7(0x40000000), %bb.9(0x40000000) ; SI-NEXT: {{ $}} ; SI-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; SI-NEXT: [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 - ; SI-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY9]] + ; SI-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY9]] ; SI-NEXT: $vgpr0 = COPY killed [[PHI7]] ; SI-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE3]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0 ; SI-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 @@ -367,6 +367,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: {{ $}} ; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[PHI1]], %subreg.sub0, killed [[PHI2]], %subreg.sub1 ; SI-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; SI-NEXT: [[COPY6:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.3: ; SI-NEXT: successors: %bb.4(0x80000000) @@ -382,8 +383,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) ; SI-NEXT: {{ $}} ; SI-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; SI-NEXT: [[COPY6:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 - ; SI-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY6]] + ; SI-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY6]] ; SI-NEXT: $vgpr0 = COPY [[COPY4]] ; SI-NEXT: 
dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE1]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0 ; SI-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 @@ -403,6 +403,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: {{ $}} ; SI-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[COPY1]], %subreg.sub0, killed [[COPY]], %subreg.sub1 ; SI-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; SI-NEXT: [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.7: ; SI-NEXT: successors: %bb.8(0x80000000) @@ -418,8 +419,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: successors: %bb.7(0x40000000), %bb.9(0x40000000) ; SI-NEXT: {{ $}} ; SI-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; SI-NEXT: [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 - ; SI-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY9]] + ; SI-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY9]] ; SI-NEXT: $vgpr0 = COPY [[COPY4]] ; SI-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE3]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0 ; SI-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 diff --git a/llvm/test/CodeGen/AVR/shift.ll b/llvm/test/CodeGen/AVR/shift.ll index c0abc77c9b14ae..55ea509a8a5b67 100644 --- a/llvm/test/CodeGen/AVR/shift.ll +++ b/llvm/test/CodeGen/AVR/shift.ll @@ -60,13 +60,13 @@ define i64 @shift_i64_i64(i64 %a, i64 %b) { ; CHECK-NEXT: breq .LBB3_3 ; CHECK-NEXT: ; %bb.1: ; %shift.loop.preheader ; CHECK-NEXT: mov r27, r1 -; CHECK-NEXT: mov r16, r1 -; CHECK-NEXT: mov r17, r1 +; CHECK-NEXT: mov r16, r27 +; CHECK-NEXT: mov r17, r27 ; CHECK-NEXT: 
.LBB3_2: ; %shift.loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov r31, r21 ; CHECK-NEXT: lsl r31 -; CHECK-NEXT: mov r26, r1 +; CHECK-NEXT: mov r26, r27 ; CHECK-NEXT: rol r26 ; CHECK-NEXT: lsl r22 ; CHECK-NEXT: rol r23 diff --git a/llvm/test/CodeGen/RISCV/machinelicm-constant-phys-reg.ll b/llvm/test/CodeGen/RISCV/machinelicm-constant-phys-reg.ll new file mode 100644 index 00000000000000..e30bdfb939471f --- /dev/null +++ b/llvm/test/CodeGen/RISCV/machinelicm-constant-phys-reg.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -O3 < %s -mtriple=riscv64 -mattr=+v | FileCheck %s + +declare i32 @llvm.vector.reduce.add.nxv2i32() + +define i32 @test(ptr %a, i64 %n) { +; CHECK-LABEL: test: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: .LBB0_1: # %loop +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vl1re32.v v9, (a0) +; CHECK-NEXT: mv a2, a3 +; CHECK-NEXT: vredsum.vs v9, v9, v8 +; CHECK-NEXT: vmv.x.s a3, v9 +; CHECK-NEXT: addw a3, a3, a3 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: addi a0, a0, 8 +; CHECK-NEXT: bnez a1, .LBB0_1 +; CHECK-NEXT: # %bb.2: # %exit +; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: ret +entry: + br label %loop + +loop: + %indvar = phi i64 [ 0, %entry ], [ %indvar.inc, %loop ] + %sum = phi i32 [ 0, %entry ], [ %sum.inc, %loop ] + %idx = getelementptr inbounds ptr, ptr %a, i64 %indvar + %data = load , ptr %idx + %reduce = tail call i32 @llvm.vector.reduce.add.nxv2i32( %data) + %sum.inc = add i32 %reduce, %reduce + %indvar.inc = add i64 %indvar, 1 + %cmp = icmp eq i64 %indvar.inc, %n + br i1 %cmp, label %exit, label %loop + +exit: + ret i32 %sum +} diff --git a/llvm/test/CodeGen/RISCV/rvv/65704-illegal-instruction.ll b/llvm/test/CodeGen/RISCV/rvv/65704-illegal-instruction.ll index 42d6dac5b07fa3..5ced89c17c4208 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/65704-illegal-instruction.ll +++ b/llvm/test/CodeGen/RISCV/rvv/65704-illegal-instruction.ll @@ -15,27 +15,30 @@ define void @foo( %0) { ; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; CHECK-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s2, 0(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 +; CHECK-NEXT: .cfi_offset s2, -32 +; CHECK-NEXT: li s0, 0 ; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 0, e8, m1, tu, ma ; CHECK-NEXT: vslideup.vi v9, v10, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv.x.s s0, v9 +; CHECK-NEXT: vmv.x.s s1, v9 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, tu, ma ; CHECK-NEXT: vslideup.vi v8, v9, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv.x.s s1, v8 +; CHECK-NEXT: vmv.x.s s2, v8 ; CHECK-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: li a1, 0 -; CHECK-NEXT: mv a0, s0 -; CHECK-NEXT: mv a2, s1 -; CHECK-NEXT: li a3, 0 -; CHECK-NEXT: li a4, 0 -; CHECK-NEXT: li a5, 0 -; CHECK-NEXT: jalr a1 +; CHECK-NEXT: mv a0, s1 +; CHECK-NEXT: mv a1, s0 +; CHECK-NEXT: mv a2, s2 +; CHECK-NEXT: mv a3, s0 +; CHECK-NEXT: mv a4, s0 +; CHECK-NEXT: mv a5, s0 +; CHECK-NEXT: jalr s0 ; CHECK-NEXT: j .LBB0_1 %2 = tail call @llvm.vector.insert.nxv8i8.v16i8( undef, <16 x i8> undef, i64 0) %3 = tail call @llvm.vector.insert.nxv8i8.v16i8( undef, <16 x i8> poison, i64 0) diff --git a/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll b/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll index 9da4d7ec9f2d05..4aa26d6b79ca46 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll @@ -11,22 +11,22 @@ define i32 @test(i32 %size, ptr %add.ptr, i64 %const) { ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV32-NEXT: 
vmv.v.x v8, a3 ; RV32-NEXT: addi a3, a2, 1 -; RV32-NEXT: addi a4, a0, 1 +; RV32-NEXT: vmv.s.x v9, zero +; RV32-NEXT: vsetvli zero, a3, e8, mf2, tu, ma +; RV32-NEXT: vslideup.vx v8, v9, a2 +; RV32-NEXT: addi a2, a0, 1 ; RV32-NEXT: .LBB0_1: # %for.body ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32-NEXT: th.lrb a0, a1, a0, 0 -; RV32-NEXT: vmv.s.x v9, zero -; RV32-NEXT: vmv1r.v v10, v8 -; RV32-NEXT: vsetvli zero, a3, e8, mf2, tu, ma -; RV32-NEXT: vslideup.vx v10, v9, a2 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, tu, ma -; RV32-NEXT: vmv.s.x v10, a0 -; RV32-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; RV32-NEXT: vmseq.vi v9, v10, 0 +; RV32-NEXT: vmv1r.v v9, v8 +; RV32-NEXT: vsetivli zero, 8, e8, m1, tu, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vmseq.vi v9, v9, 0 ; RV32-NEXT: vmv.x.s a0, v9 -; RV32-NEXT: andi a5, a0, 255 -; RV32-NEXT: mv a0, a4 -; RV32-NEXT: bnez a5, .LBB0_1 +; RV32-NEXT: andi a3, a0, 255 +; RV32-NEXT: mv a0, a2 +; RV32-NEXT: bnez a3, .LBB0_1 ; RV32-NEXT: # %bb.2: # %if.then381 ; RV32-NEXT: li a0, 0 ; RV32-NEXT: ret @@ -37,23 +37,23 @@ define i32 @test(i32 %size, ptr %add.ptr, i64 %const) { ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64-NEXT: vmv.v.x v8, a3 ; RV64-NEXT: addi a3, a2, 1 -; RV64-NEXT: addi a4, a0, 1 +; RV64-NEXT: vmv.s.x v9, zero +; RV64-NEXT: vsetvli zero, a3, e8, mf2, tu, ma +; RV64-NEXT: vslideup.vx v8, v9, a2 +; RV64-NEXT: addi a2, a0, 1 ; RV64-NEXT: .LBB0_1: # %for.body ; RV64-NEXT: # =>This Inner Loop Header: Depth=1 ; RV64-NEXT: sext.w a0, a0 ; RV64-NEXT: th.lrb a0, a1, a0, 0 -; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vmv1r.v v10, v8 -; RV64-NEXT: vsetvli zero, a3, e8, mf2, tu, ma -; RV64-NEXT: vslideup.vx v10, v9, a2 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, tu, ma -; RV64-NEXT: vmv.s.x v10, a0 -; RV64-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; RV64-NEXT: vmseq.vi v9, v10, 0 +; RV64-NEXT: vmv1r.v v9, v8 +; RV64-NEXT: vsetivli zero, 8, e8, m1, tu, ma +; RV64-NEXT: 
vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vmseq.vi v9, v9, 0 ; RV64-NEXT: vmv.x.s a0, v9 -; RV64-NEXT: andi a5, a0, 255 -; RV64-NEXT: mv a0, a4 -; RV64-NEXT: bnez a5, .LBB0_1 +; RV64-NEXT: andi a3, a0, 255 +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: bnez a3, .LBB0_1 ; RV64-NEXT: # %bb.2: # %if.then381 ; RV64-NEXT: li a0, 0 ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/vlenb.ll b/llvm/test/CodeGen/RISCV/vlenb.ll index 1d6c1b5d1acbdc..26d4f99c3b9792 100644 --- a/llvm/test/CodeGen/RISCV/vlenb.ll +++ b/llvm/test/CodeGen/RISCV/vlenb.ll @@ -71,10 +71,13 @@ define void @machine_licm() { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -4 +; CHECK-NEXT: .cfi_offset s0, -8 +; CHECK-NEXT: csrr s0, vlenb ; CHECK-NEXT: .LBB4_1: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: mv a0, s0 ; CHECK-NEXT: call use ; CHECK-NEXT: j .LBB4_1 entry: From 476a6d81a3648cf638400632c098e9f0ed025f8f Mon Sep 17 00:00:00 2001 From: MagentaTreehouse <99200384+MagentaTreehouse@users.noreply.github.com> Date: Wed, 29 May 2024 02:12:26 -0400 Subject: [PATCH 064/230] [NFC] Construct Twines before concatenation (#90728) Construct `Twine`s before concatenation. 
--- clang/lib/Driver/ToolChains/HIPUtility.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/Driver/ToolChains/HIPUtility.cpp b/clang/lib/Driver/ToolChains/HIPUtility.cpp index b1ff697b368b13..f32a23f111e4bf 100644 --- a/clang/lib/Driver/ToolChains/HIPUtility.cpp +++ b/clang/lib/Driver/ToolChains/HIPUtility.cpp @@ -106,9 +106,9 @@ class HIPUndefinedFatBinSymbols { std::string ID = IA->getId().str(); if (!ID.empty()) { ID = llvm::utohexstr(llvm::MD5Hash(ID), /*LowerCase=*/true); - FatBinSymbols.insert(Twine(FatBinPrefix + "_" + ID).str()); + FatBinSymbols.insert((FatBinPrefix + Twine('_') + ID).str()); GPUBinHandleSymbols.insert( - Twine(GPUBinHandlePrefix + "_" + ID).str()); + (GPUBinHandlePrefix + Twine('_') + ID).str()); continue; } if (IA->getInputArg().getNumValues() == 0) From 7f58ffd09b29d3ff4f9fa025bd4d05dd8fd9fc38 Mon Sep 17 00:00:00 2001 From: Guray Ozen Date: Wed, 29 May 2024 08:43:13 +0200 Subject: [PATCH 065/230] [mlir][python] Yield results of `scf.for_` (#93610) Using `for_` is very handy with python bindings. Currently, it doesn't support results, so we had to fall back to a two-line scf.for.
This PR yields results of scf.for in `for_` --------- Co-authored-by: Maksim Levental --- mlir/python/mlir/dialects/scf.py | 4 +-- mlir/test/python/dialects/scf.py | 50 ++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/mlir/python/mlir/dialects/scf.py b/mlir/python/mlir/dialects/scf.py index dad7377987e56c..7025f6e0f1a166 100644 --- a/mlir/python/mlir/dialects/scf.py +++ b/mlir/python/mlir/dialects/scf.py @@ -132,8 +132,8 @@ def for_( iter_args = tuple(for_op.inner_iter_args) with InsertionPoint(for_op.body): if len(iter_args) > 1: - yield iv, iter_args + yield iv, iter_args, for_op.results elif len(iter_args) == 1: - yield iv, iter_args[0] + yield iv, iter_args[0], for_op.results[0] else: yield iv diff --git a/mlir/test/python/dialects/scf.py b/mlir/test/python/dialects/scf.py index ee8d09aa301d98..95a6de86b670d5 100644 --- a/mlir/test/python/dialects/scf.py +++ b/mlir/test/python/dialects/scf.py @@ -176,6 +176,56 @@ def range_loop_7(lb, ub, step, memref_v): memref.store(add, memref_v, [i]) scf.yield_([]) + # CHECK: func.func @loop_yield_1(%[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index, %[[VAL_3:.*]]: memref<10xindex>) { + # CHECK: %[[VAL_4:.*]] = arith.constant 0 : index + # CHECK: %[[VAL_5:.*]] = arith.constant 0 : index + # CHECK: %[[VAL_6:.*]] = arith.constant 0 : index + # CHECK: %[[VAL_7:.*]] = arith.constant 100 : index + # CHECK: %[[VAL_8:.*]] = arith.constant 1 : index + # CHECK: %[[VAL_10:.*]] = scf.for %[[IV:.*]] = %[[VAL_6]] to %[[VAL_7]] step %[[VAL_8]] iter_args(%[[ITER:.*]] = %[[VAL_4]]) -> (index) { + # CHECK: %[[VAL_9:.*]] = arith.addi %[[ITER]], %[[IV]] : index + # CHECK: scf.yield %[[VAL_9]] : index + # CHECK: } + # CHECK: memref.store %[[VAL_10]], %[[VAL_3]]{{\[}}%[[VAL_5]]] : memref<10xindex> + # CHECK: return + # CHECK: } + @func.FuncOp.from_py_func(index_type, index_type, index_type, memref_t) + def loop_yield_1(lb, ub, step, memref_v): + sum = arith.ConstantOp.create_index(0) + 
c0 = arith.ConstantOp.create_index(0) + for i, loc_sum, sum in scf.for_(0, 100, 1, [sum]): + loc_sum = arith.addi(loc_sum, i) + scf.yield_([loc_sum]) + memref.store(sum, memref_v, [c0]) + + # CHECK: func.func @loop_yield_2(%[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index, %[[VAL_3:.*]]: memref<10xindex>) { + # CHECK: %[[c0:.*]] = arith.constant 0 : index + # CHECK: %[[c2:.*]] = arith.constant 2 : index + # CHECK: %[[REF1:.*]] = arith.constant 0 : index + # CHECK: %[[REF2:.*]] = arith.constant 1 : index + # CHECK: %[[VAL_6:.*]] = arith.constant 0 : index + # CHECK: %[[VAL_7:.*]] = arith.constant 100 : index + # CHECK: %[[VAL_8:.*]] = arith.constant 1 : index + # CHECK: %[[RES:.*]] = scf.for %[[IV:.*]] = %[[VAL_6]] to %[[VAL_7]] step %[[VAL_8]] iter_args(%[[ITER1:.*]] = %[[c0]], %[[ITER2:.*]] = %[[c2]]) -> (index, index) { + # CHECK: %[[VAL_9:.*]] = arith.addi %[[ITER1]], %[[IV]] : index + # CHECK: %[[VAL_10:.*]] = arith.addi %[[ITER2]], %[[IV]] : index + # CHECK: scf.yield %[[VAL_9]], %[[VAL_10]] : index, index + # CHECK: } + # CHECK: return + # CHECK: } + @func.FuncOp.from_py_func(index_type, index_type, index_type, memref_t) + def loop_yield_2(lb, ub, step, memref_v): + sum1 = arith.ConstantOp.create_index(0) + sum2 = arith.ConstantOp.create_index(2) + c0 = arith.ConstantOp.create_index(0) + c1 = arith.ConstantOp.create_index(1) + for i, [loc_sum1, loc_sum2], [sum1, sum2] in scf.for_(0, 100, 1, [sum1, sum2]): + loc_sum1 = arith.addi(loc_sum1, i) + loc_sum2 = arith.addi(loc_sum2, i) + scf.yield_([loc_sum1, loc_sum2]) + memref.store(sum1, memref_v, [c0]) + memref.store(sum2, memref_v, [c1]) + @constructAndPrintInModule def testOpsAsArguments(): From c2a9a974ca85e4ac4509e368d4b9acae7e67bf71 Mon Sep 17 00:00:00 2001 From: Antonio Frighetto Date: Sat, 18 May 2024 16:46:03 +0200 Subject: [PATCH 066/230] [LICM] Introduce test for PR92655 (NFC) --- .../LICM/update-scev-after-hoist.ll | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 
100644 llvm/test/Transforms/LICM/update-scev-after-hoist.ll diff --git a/llvm/test/Transforms/LICM/update-scev-after-hoist.ll b/llvm/test/Transforms/LICM/update-scev-after-hoist.ll new file mode 100644 index 00000000000000..f834a74b6f247c --- /dev/null +++ b/llvm/test/Transforms/LICM/update-scev-after-hoist.ll @@ -0,0 +1,24 @@ +; RUN: opt -S -passes='loop-unroll,loop-mssa(licm),print' -unroll-count=4 -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCEV-EXPR + +define i16 @main() { +; SCEV-EXPR: Classifying expressions for: @main +; SCEV-EXPR-NEXT: %mul = phi i16 [ 1, %entry ], [ %mul.n.3, %loop ] +; SCEV-EXPR-NEXT: --> %mul U: full-set S: [-32768,32753) Exits: 4096 LoopDispositions: { %loop: Variant } +; SCEV-EXPR-NEXT: %div = phi i16 [ 32767, %entry ], [ %div.n.3, %loop ] +; SCEV-EXPR-NEXT: --> %div U: [-2048,-32768) S: [-2048,-32768) Exits: 7 LoopDispositions: { %loop: Variant } +; SCEV-EXPR-NEXT: %mul.n = mul i16 %mul, 8 +; SCEV-EXPR-NEXT: --> (2 * %mul) U: [0,-1) S: [-32768,32767) Exits: 8192 LoopDispositions: { %loop: Variant } +entry: + br label %loop + +loop: + %mul = phi i16 [ 1, %entry ], [ %mul.n, %loop ] + %div = phi i16 [ 32767, %entry ], [ %div.n, %loop ] + %mul.n = mul i16 %mul, 2 + %div.n = sdiv i16 %div, 2 + %cmp = icmp sgt i16 %div, 0 + br i1 %cmp, label %loop, label %end + +end: + ret i16 %mul +} From 70091dc943ade280d75cea1e5ea5e93d9a8f934a Mon Sep 17 00:00:00 2001 From: Antonio Frighetto Date: Sat, 18 May 2024 16:52:17 +0200 Subject: [PATCH 067/230] [LICM] Invalidate cached SCEV results in `hoistMulAddAssociation` While reassociating expressions, LICM is required to invalidate SCEV results, as otherwise subsequent passes in the pipeline that leverage LICM foldings (e.g. IndVars), may reason on invalid expressions; thus miscompiling. This is achieved by rewriting the reassociable instruction from scratch. Fixes: https://github.com/llvm/llvm-project/issues/91957. 
--- llvm/lib/Transforms/Scalar/LICM.cpp | 16 ++++++++++++++-- .../Transforms/LICM/update-scev-after-hoist.ll | 6 +++--- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 6aa4188d1cc4d4..5eccf7b4adb65e 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -2751,7 +2751,7 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L, IRBuilder<> Builder(Preheader->getTerminator()); for (auto *U : Changes) { assert(L.isLoopInvariant(U->get())); - Instruction *Ins = cast(U->getUser()); + auto *Ins = cast(U->getUser()); Value *Mul; if (I.getType()->isIntOrIntVectorTy()) { Mul = Builder.CreateMul(U->get(), Factor, "factor.op.mul"); @@ -2759,8 +2759,20 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L, Ins->dropPoisonGeneratingFlags(); } else Mul = Builder.CreateFMulFMF(U->get(), Factor, Ins, "factor.op.fmul"); - U->set(Mul); + + // Rewrite the reassociable instruction. + unsigned OpIdx = U->getOperandNo(); + auto *LHS = OpIdx == 0 ? Mul : Ins->getOperand(0); + auto *RHS = OpIdx == 1 ? 
Mul : Ins->getOperand(1); + auto *NewBO = BinaryOperator::Create(Ins->getOpcode(), LHS, RHS, + Ins->getName() + ".reass", Ins); + NewBO->copyIRFlags(Ins); + if (VariantOp == Ins) + VariantOp = NewBO; + Ins->replaceAllUsesWith(NewBO); + eraseInstruction(*Ins, SafetyInfo, MSSAU); } + I.replaceAllUsesWith(VariantOp); eraseInstruction(I, SafetyInfo, MSSAU); return true; diff --git a/llvm/test/Transforms/LICM/update-scev-after-hoist.ll b/llvm/test/Transforms/LICM/update-scev-after-hoist.ll index f834a74b6f247c..fc45b8fce1766a 100644 --- a/llvm/test/Transforms/LICM/update-scev-after-hoist.ll +++ b/llvm/test/Transforms/LICM/update-scev-after-hoist.ll @@ -3,11 +3,11 @@ define i16 @main() { ; SCEV-EXPR: Classifying expressions for: @main ; SCEV-EXPR-NEXT: %mul = phi i16 [ 1, %entry ], [ %mul.n.3, %loop ] -; SCEV-EXPR-NEXT: --> %mul U: full-set S: [-32768,32753) Exits: 4096 LoopDispositions: { %loop: Variant } +; SCEV-EXPR-NEXT: --> %mul U: [0,-15) S: [-32768,32753) Exits: 4096 LoopDispositions: { %loop: Variant } ; SCEV-EXPR-NEXT: %div = phi i16 [ 32767, %entry ], [ %div.n.3, %loop ] ; SCEV-EXPR-NEXT: --> %div U: [-2048,-32768) S: [-2048,-32768) Exits: 7 LoopDispositions: { %loop: Variant } -; SCEV-EXPR-NEXT: %mul.n = mul i16 %mul, 8 -; SCEV-EXPR-NEXT: --> (2 * %mul) U: [0,-1) S: [-32768,32767) Exits: 8192 LoopDispositions: { %loop: Variant } +; SCEV-EXPR-NEXT: %mul.n.reass.reass = mul i16 %mul, 8 +; SCEV-EXPR-NEXT: --> (8 * %mul) U: [0,-7) S: [-32768,32761) Exits: -32768 LoopDispositions: { %loop: Variant } entry: br label %loop From 53d79feec93ef99e2ba0ac8cfc6cf2f81d28bf8a Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Wed, 29 May 2024 09:17:24 +0200 Subject: [PATCH 068/230] [lldb/DWARF] Bypass the compres^Wconstruction of DIERefs in debug_names (#93296) DebugNamesDWARFIndex was jumping through hoops to construct a DIERef from an index entry only to jump through them back a short while later to construct a DWARFDIE. 
This used to be necessary as the index lookup was a two stage process, where we first enumerated all matches, and then examined them (so it was important that the enumeration was cheap -- does not trigger unnecessary parsing). However, now that the processing is callback based, we are always immediately examining the DWARFDIE right after finding the entry, and the DIERef just gets in the way. --- .../SymbolFile/DWARF/AppleDWARFIndex.cpp | 8 ++- .../Plugins/SymbolFile/DWARF/DWARFIndex.cpp | 7 +-- .../Plugins/SymbolFile/DWARF/DWARFIndex.h | 9 ++- .../SymbolFile/DWARF/DebugNamesDWARFIndex.cpp | 58 ++++++++----------- .../SymbolFile/DWARF/DebugNamesDWARFIndex.h | 2 +- 5 files changed, 37 insertions(+), 47 deletions(-) diff --git a/lldb/source/Plugins/SymbolFile/DWARF/AppleDWARFIndex.cpp b/lldb/source/Plugins/SymbolFile/DWARF/AppleDWARFIndex.cpp index 33537df4f50762..1703597a7cd2fd 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/AppleDWARFIndex.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/AppleDWARFIndex.cpp @@ -284,8 +284,12 @@ void AppleDWARFIndex::GetFunctions( for (const auto &entry : m_apple_names_up->equal_range(name)) { DIERef die_ref(std::nullopt, DIERef::Section::DebugInfo, *entry.getDIESectionOffset()); - if (!ProcessFunctionDIE(lookup_info, die_ref, dwarf, parent_decl_ctx, - callback)) + DWARFDIE die = dwarf.GetDIE(die_ref); + if (!die) { + ReportInvalidDIERef(die_ref, name); + continue; + } + if (!ProcessFunctionDIE(lookup_info, die, parent_decl_ctx, callback)) return; } } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.cpp index 20c07a94b50769..30fb5d5ebdb0df 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.cpp @@ -24,16 +24,11 @@ using namespace lldb_private::plugin::dwarf; DWARFIndex::~DWARFIndex() = default; bool DWARFIndex::ProcessFunctionDIE( - const Module::LookupInfo &lookup_info, DIERef ref, SymbolFileDWARF 
&dwarf, + const Module::LookupInfo &lookup_info, DWARFDIE die, const CompilerDeclContext &parent_decl_ctx, llvm::function_ref callback) { llvm::StringRef name = lookup_info.GetLookupName().GetStringRef(); FunctionNameType name_type_mask = lookup_info.GetNameTypeMask(); - DWARFDIE die = dwarf.GetDIE(ref); - if (!die) { - ReportInvalidDIERef(ref, name); - return true; - } if (!(name_type_mask & eFunctionNameTypeFull)) { ConstString name_to_match_against; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.h index 0551b07100a96b..cb3ae8a06d7885 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.h @@ -81,11 +81,10 @@ class DWARFIndex { StatsDuration m_index_time; /// Helper function implementing common logic for processing function dies. If - /// the function given by "ref" matches search criteria given by - /// "parent_decl_ctx" and "name_type_mask", it is inserted into the "dies" - /// vector. - bool ProcessFunctionDIE(const Module::LookupInfo &lookup_info, DIERef ref, - SymbolFileDWARF &dwarf, + /// the function given by "die" matches search criteria given by + /// "parent_decl_ctx" and "name_type_mask", it calls the callback with the + /// given die. + bool ProcessFunctionDIE(const Module::LookupInfo &lookup_info, DWARFDIE die, const CompilerDeclContext &parent_decl_ctx, llvm::function_ref callback); diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp index c98e5481609dea..56717bab1ecd86 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp @@ -64,27 +64,25 @@ DebugNamesDWARFIndex::GetNonSkeletonUnit(const DebugNames::Entry &entry) const { return cu ? 
&cu->GetNonSkeletonUnit() : nullptr; } -std::optional -DebugNamesDWARFIndex::ToDIERef(const DebugNames::Entry &entry) const { +DWARFDIE DebugNamesDWARFIndex::GetDIE(const DebugNames::Entry &entry) const { DWARFUnit *unit = GetNonSkeletonUnit(entry); - if (!unit) - return std::nullopt; - if (std::optional die_offset = entry.getDIEUnitOffset()) - return DIERef(unit->GetSymbolFileDWARF().GetFileIndex(), - DIERef::Section::DebugInfo, unit->GetOffset() + *die_offset); - - return std::nullopt; + std::optional die_offset = entry.getDIEUnitOffset(); + if (!unit || !die_offset) + return DWARFDIE(); + if (DWARFDIE die = unit->GetDIE(unit->GetOffset() + *die_offset)) + return die; + + m_module.ReportErrorIfModifyDetected( + "the DWARF debug information has been modified (bad offset {0:x} in " + "debug_names section)\n", + *die_offset); + return DWARFDIE(); } bool DebugNamesDWARFIndex::ProcessEntry( const DebugNames::Entry &entry, llvm::function_ref callback) { - std::optional ref = ToDIERef(entry); - if (!ref) - return true; - SymbolFileDWARF &dwarf = *llvm::cast( - m_module.GetSymbolFile()->GetBackingSymbolFile()); - DWARFDIE die = dwarf.GetDIE(*ref); + DWARFDIE die = GetDIE(entry); if (!die) return true; // Clang erroneously emits index entries for declaration DIEs in case when the @@ -187,7 +185,7 @@ void DebugNamesDWARFIndex::GetCompleteObjCClass( llvm::function_ref callback) { // Keep a list of incomplete types as fallback for when we don't find the // complete type. 
- DIEArray incomplete_types; + std::vector incomplete_types; for (const DebugNames::Entry &entry : m_debug_names_up->equal_range(class_name.GetStringRef())) { @@ -195,19 +193,14 @@ void DebugNamesDWARFIndex::GetCompleteObjCClass( entry.tag() != DW_TAG_class_type) continue; - std::optional ref = ToDIERef(entry); - if (!ref) - continue; - - DWARFUnit *cu = m_debug_info.GetUnit(*ref); - if (!cu || !cu->Supports_DW_AT_APPLE_objc_complete_type()) { - incomplete_types.push_back(*ref); + DWARFDIE die = GetDIE(entry); + if (!die) { + // Report invalid continue; } - - DWARFDIE die = m_debug_info.GetDIE(*ref); - if (!die) { - ReportInvalidDIERef(*ref, class_name.GetStringRef()); + DWARFUnit *cu = die.GetCU(); + if (!cu->Supports_DW_AT_APPLE_objc_complete_type()) { + incomplete_types.push_back(die); continue; } @@ -216,12 +209,11 @@ void DebugNamesDWARFIndex::GetCompleteObjCClass( callback(die); return; } - incomplete_types.push_back(*ref); + incomplete_types.push_back(die); } - auto dierefcallback = DIERefCallback(callback, class_name.GetStringRef()); - for (DIERef ref : incomplete_types) - if (!dierefcallback(ref)) + for (DWARFDIE die : incomplete_types) + if (!callback(die)) return; m_fallback.GetCompleteObjCClass(class_name, must_be_implementation, callback); @@ -383,8 +375,8 @@ void DebugNamesDWARFIndex::GetFunctions( if (tag != DW_TAG_subprogram && tag != DW_TAG_inlined_subroutine) continue; - if (std::optional ref = ToDIERef(entry)) { - if (!ProcessFunctionDIE(lookup_info, *ref, dwarf, parent_decl_ctx, + if (DWARFDIE die = GetDIE(entry)) { + if (!ProcessFunctionDIE(lookup_info, die, parent_decl_ctx, [&](DWARFDIE die) { if (!seen.insert(die.GetDIE()).second) return true; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.h b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.h index 81fb8f88b805af..a27a414ecdd193 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.h +++ 
b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.h @@ -84,7 +84,7 @@ class DebugNamesDWARFIndex : public DWARFIndex { ManualDWARFIndex m_fallback; DWARFUnit *GetNonSkeletonUnit(const DebugNames::Entry &entry) const; - std::optional ToDIERef(const DebugNames::Entry &entry) const; + DWARFDIE GetDIE(const DebugNames::Entry &entry) const; bool ProcessEntry(const DebugNames::Entry &entry, llvm::function_ref callback); From 2cfea14a57ad8443c6898d2310abb4346dc92ad2 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Wed, 29 May 2024 09:27:32 +0200 Subject: [PATCH 069/230] [lldb-dap] Add timestamps to protocol logs (#93540) I've found them very useful as a rudimentary form of benchmark. --- lldb/tools/lldb-dap/DAP.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp index c7eb3db4304a90..d419f821999e6c 100644 --- a/lldb/tools/lldb-dap/DAP.cpp +++ b/lldb/tools/lldb-dap/DAP.cpp @@ -103,7 +103,9 @@ void DAP::SendJSON(const llvm::json::Value &json) { SendJSON(json_str); if (log) { - *log << "<-- " << std::endl + auto now = std::chrono::duration( + std::chrono::system_clock::now().time_since_epoch()); + *log << llvm::formatv("{0:f9} <-- ", now.count()).str() << std::endl << "Content-Length: " << json_str.size() << "\r\n\r\n" << llvm::formatv("{0:2}", json).str() << std::endl; } @@ -130,9 +132,12 @@ std::string DAP::ReadJSON() { if (!input.read_full(log.get(), length, json_str)) return json_str; - if (log) - *log << "--> " << std::endl << "Content-Length: " << length << "\r\n\r\n"; - + if (log) { + auto now = std::chrono::duration( + std::chrono::system_clock::now().time_since_epoch()); + *log << llvm::formatv("{0:f9} --> ", now.count()).str() << std::endl + << "Content-Length: " << length << "\r\n\r\n"; + } return json_str; } From 98714866830f505d7bb87de6b92a28f280a34b9b Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Fri, 22 Mar 2024 12:04:58 -0500 Subject: [PATCH 070/230] 
[InstCombine] Add multiuse tests for canonicalizing (icmp eq/ne (and x, y), {x,y}); NFC --- .../test/Transforms/InstCombine/icmp-of-and-x.ll | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/icmp-of-and-x.ll b/llvm/test/Transforms/InstCombine/icmp-of-and-x.ll index e95c72b75f97df..75070e5a34f949 100644 --- a/llvm/test/Transforms/InstCombine/icmp-of-and-x.ll +++ b/llvm/test/Transforms/InstCombine/icmp-of-and-x.ll @@ -3,6 +3,7 @@ declare i1 @barrier() declare void @llvm.assume(i1) +declare void @use.i8(i8) define i1 @icmp_ult_x_y(i8 %x, i8 %y) { ; CHECK-LABEL: @icmp_ult_x_y( @@ -262,6 +263,21 @@ define i1 @icmp_eq_x_invertable_y(i8 %x, i8 %y) { ret i1 %r } +define i1 @icmp_eq_x_invertable_y_fail_multiuse(i8 %x, i8 %y) { +; CHECK-LABEL: @icmp_eq_x_invertable_y_fail_multiuse( +; CHECK-NEXT: [[YY:%.*]] = xor i8 [[Y:%.*]], -1 +; CHECK-NEXT: [[AND:%.*]] = and i8 [[YY]], [[X:%.*]] +; CHECK-NEXT: call void @use.i8(i8 [[AND]]) +; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[AND]], [[X]] +; CHECK-NEXT: ret i1 [[R]] +; + %yy = xor i8 %y, -1 + %and = and i8 %x, %yy + call void @use.i8(i8 %and) + %r = icmp eq i8 %x, %and + ret i1 %r +} + define i1 @icmp_eq_x_invertable_y2_todo(i8 %x, i1 %y) { ; CHECK-LABEL: @icmp_eq_x_invertable_y2_todo( ; CHECK-NEXT: [[YY:%.*]] = select i1 [[Y:%.*]], i8 7, i8 24 From 5532ab17327f2887fdac739ffaaae6c341695370 Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Wed, 13 Sep 2023 13:45:58 -0500 Subject: [PATCH 071/230] [InstCombine] Make the `(icmp eq/ne (and X, Y), X)` canonicalization work for non-const operands We currently do: `(icmp eq/ne (and X, Y), Y)` -> `(icmp eq/ne (and ~X, Y), 0)` if `X` is constant. We can make this more general and do it if `X` is freely invertible (i.e., say `X = ~Z`). As well, we can also do: `(icmp eq/ne (and X, Y), Y)` -> `(icmp eq/ne (or X, ~Y), -1)` if `Y` is freely invertible.
Proofs: https://alive2.llvm.org/ce/z/yeWH3E Differential Revision: https://reviews.llvm.org/D159059 Closes #84688 --- .../InstCombine/InstCombineCompares.cpp | 30 +++++----- .../ValueTracking/known-power-of-two-urem.ll | 18 +++--- ...low-bit-mask-v2-and-icmp-eq-to-icmp-ule.ll | 17 +++--- ...low-bit-mask-v2-and-icmp-ne-to-icmp-ugt.ll | 17 +++--- ...low-bit-mask-v3-and-icmp-eq-to-icmp-ule.ll | 8 +-- ...low-bit-mask-v3-and-icmp-ne-to-icmp-ugt.ll | 8 +-- .../InstCombine/icmp-and-lowbit-mask.ll | 59 +++++++++---------- .../Transforms/InstCombine/icmp-of-and-x.ll | 20 +++---- 8 files changed, 88 insertions(+), 89 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index c3272d97509f53..89193f8ff94b6e 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -4730,6 +4730,21 @@ static Instruction *foldICmpAndXX(ICmpInst &I, const SimplifyQuery &Q, if (Pred == ICmpInst::ICMP_UGE) return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1); + if (ICmpInst::isEquality(Pred) && Op0->hasOneUse()) { + // icmp (X & Y) eq/ne Y --> (X | ~Y) eq/ne -1 if Y is freely invertible and + // Y is non-constant. If Y is constant the `X & C == C` form is preferable + // so don't do this fold. + if (!match(Op1, m_ImmConstant())) + if (auto *NotOp1 = + IC.getFreelyInverted(Op1, !Op1->hasNUsesOrMore(3), &IC.Builder)) + return new ICmpInst(Pred, IC.Builder.CreateOr(A, NotOp1), + Constant::getAllOnesValue(Op1->getType())); + // icmp (X & Y) eq/ne Y --> (~X & Y) eq/ne 0 if X is freely invertible. 
+ if (auto *NotA = IC.getFreelyInverted(A, A->hasOneUse(), &IC.Builder)) + return new ICmpInst(Pred, IC.Builder.CreateAnd(Op1, NotA), + Constant::getNullValue(Op1->getType())); + } + return nullptr; } @@ -5505,21 +5520,6 @@ Instruction *InstCombinerImpl::foldICmpEquality(ICmpInst &I) { } } - // canoncalize: - // (icmp eq/ne (and X, C), X) - // -> (icmp eq/ne (and X, ~C), 0) - { - Constant *CMask; - A = nullptr; - if (match(Op0, m_OneUse(m_And(m_Specific(Op1), m_ImmConstant(CMask))))) - A = Op1; - else if (match(Op1, m_OneUse(m_And(m_Specific(Op0), m_ImmConstant(CMask))))) - A = Op0; - if (A) - return new ICmpInst(Pred, Builder.CreateAnd(A, Builder.CreateNot(CMask)), - Constant::getNullValue(A->getType())); - } - if (match(Op1, m_Xor(m_Value(A), m_Value(B))) && (A == Op0 || B == Op0)) { // A == (A^B) -> B == 0 Value *OtherVal = A == Op0 ? B : A; diff --git a/llvm/test/Analysis/ValueTracking/known-power-of-two-urem.ll b/llvm/test/Analysis/ValueTracking/known-power-of-two-urem.ll index 47c4587f6991bd..ba3a484441e9e3 100644 --- a/llvm/test/Analysis/ValueTracking/known-power-of-two-urem.ll +++ b/llvm/test/Analysis/ValueTracking/known-power-of-two-urem.ll @@ -428,9 +428,9 @@ define i8 @known_power_of_two_lshr_add_one_allow_zero(i8 %x, i8 %y) { define i1 @known_power_of_two_lshr_add_one_nuw_deny_zero(i8 %x, i8 %y) { ; CHECK-LABEL: @known_power_of_two_lshr_add_one_nuw_deny_zero( ; CHECK-NEXT: [[TMP1:%.*]] = lshr i8 -1, [[X:%.*]] -; CHECK-NEXT: [[P:%.*]] = add nuw i8 [[TMP1]], 1 -; CHECK-NEXT: [[AND:%.*]] = and i8 [[P]], [[Y:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[AND]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = sub i8 -2, [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = or i8 [[TMP2]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[TMP3]], -1 ; CHECK-NEXT: ret i1 [[R]] ; %4 = lshr i8 -1, %x @@ -445,9 +445,9 @@ define i1 @known_power_of_two_lshr_add_one_nuw_deny_zero(i8 %x, i8 %y) { define i1 @negative_known_power_of_two_lshr_add_one_deny_zero(i8 %x, i8 %y) { ; CHECK-LABEL: 
@negative_known_power_of_two_lshr_add_one_deny_zero( ; CHECK-NEXT: [[TMP1:%.*]] = lshr i8 -1, [[X:%.*]] -; CHECK-NEXT: [[P:%.*]] = add i8 [[TMP1]], 1 -; CHECK-NEXT: [[AND:%.*]] = and i8 [[P]], [[Y:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[AND]], [[P]] +; CHECK-NEXT: [[TMP2:%.*]] = sub i8 -2, [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = or i8 [[TMP2]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[TMP3]], -1 ; CHECK-NEXT: ret i1 [[R]] ; %4 = lshr i8 -1, %x @@ -462,9 +462,9 @@ define i1 @negative_known_power_of_two_lshr_add_one_deny_zero(i8 %x, i8 %y) { define i1 @negative_known_power_of_two_lshr_add_one_nsw_deny_zero(i8 %x, i8 %y) { ; CHECK-LABEL: @negative_known_power_of_two_lshr_add_one_nsw_deny_zero( ; CHECK-NEXT: [[TMP1:%.*]] = lshr i8 -1, [[X:%.*]] -; CHECK-NEXT: [[P:%.*]] = add nsw i8 [[TMP1]], 1 -; CHECK-NEXT: [[AND:%.*]] = and i8 [[P]], [[Y:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[AND]], [[P]] +; CHECK-NEXT: [[TMP2:%.*]] = sub i8 -2, [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = or i8 [[TMP2]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[TMP3]], -1 ; CHECK-NEXT: ret i1 [[R]] ; %4 = lshr i8 -1, %x diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-eq-to-icmp-ule.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-eq-to-icmp-ule.ll index 88487b38e2c708..0a7de501ca0225 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-eq-to-icmp-ule.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-eq-to-icmp-ule.ll @@ -144,7 +144,7 @@ define i1 @oneuse0(i8 %x, i8 %y) { ; CHECK-LABEL: @oneuse0( ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T0]]) -; CHECK-NEXT: [[X_HIGHBITS:%.*]] = lshr i8 [[X:%.*]], [[Y]] +; CHECK-NEXT: [[X_HIGHBITS:%.*]] = and i8 [[T0]], [[X:%.*]] ; CHECK-NEXT: [[RET:%.*]] = icmp eq i8 [[X_HIGHBITS]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; @@ -161,7 +161,8 @@ define i1 @oneuse1(i8 %x, i8 %y) { ; 
CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]] ; CHECK-NEXT: [[T1:%.*]] = xor i8 [[T0]], -1 ; CHECK-NEXT: call void @use8(i8 [[T1]]) -; CHECK-NEXT: [[RET:%.*]] = icmp uge i8 [[T1]], [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp eq i8 [[TMP1]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 -1, %y @@ -195,7 +196,8 @@ define i1 @oneuse3(i8 %x, i8 %y) { ; CHECK-NEXT: call void @use8(i8 [[T0]]) ; CHECK-NEXT: [[T1:%.*]] = xor i8 [[T0]], -1 ; CHECK-NEXT: call void @use8(i8 [[T1]]) -; CHECK-NEXT: [[RET:%.*]] = icmp uge i8 [[T1]], [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp eq i8 [[TMP1]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 -1, %y @@ -269,9 +271,8 @@ define i1 @n0(i8 %x, i8 %y, i8 %notx) { define i1 @n1(i8 %x, i8 %y) { ; CHECK-LABEL: @n1( ; CHECK-NEXT: [[T0:%.*]] = shl nuw i8 1, [[Y:%.*]] -; CHECK-NEXT: [[T1:%.*]] = xor i8 [[T0]], -1 -; CHECK-NEXT: [[T2:%.*]] = and i8 [[T1]], [[X:%.*]] -; CHECK-NEXT: [[RET:%.*]] = icmp eq i8 [[T2]], [[X]] +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp eq i8 [[TMP1]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 1, %y ; not -1 @@ -284,9 +285,9 @@ define i1 @n1(i8 %x, i8 %y) { define i1 @n2(i8 %x, i8 %y) { ; CHECK-LABEL: @n2( ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]] -; CHECK-NEXT: [[T1:%.*]] = xor i8 [[T0]], 1 +; CHECK-NEXT: [[T1:%.*]] = xor i8 [[T0]], -2 ; CHECK-NEXT: [[T2:%.*]] = and i8 [[T1]], [[X:%.*]] -; CHECK-NEXT: [[RET:%.*]] = icmp eq i8 [[T2]], [[X]] +; CHECK-NEXT: [[RET:%.*]] = icmp eq i8 [[T2]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 -1, %y diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-ne-to-icmp-ugt.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-ne-to-icmp-ugt.ll index b717925fd644fc..54ff87676e71d9 100644 --- 
a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-ne-to-icmp-ugt.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-ne-to-icmp-ugt.ll @@ -144,7 +144,7 @@ define i1 @oneuse0(i8 %x, i8 %y) { ; CHECK-LABEL: @oneuse0( ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T0]]) -; CHECK-NEXT: [[X_HIGHBITS:%.*]] = lshr i8 [[X:%.*]], [[Y]] +; CHECK-NEXT: [[X_HIGHBITS:%.*]] = and i8 [[T0]], [[X:%.*]] ; CHECK-NEXT: [[RET:%.*]] = icmp ne i8 [[X_HIGHBITS]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; @@ -161,7 +161,8 @@ define i1 @oneuse1(i8 %x, i8 %y) { ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]] ; CHECK-NEXT: [[T1:%.*]] = xor i8 [[T0]], -1 ; CHECK-NEXT: call void @use8(i8 [[T1]]) -; CHECK-NEXT: [[RET:%.*]] = icmp ult i8 [[T1]], [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp ne i8 [[TMP1]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 -1, %y @@ -195,7 +196,8 @@ define i1 @oneuse3(i8 %x, i8 %y) { ; CHECK-NEXT: call void @use8(i8 [[T0]]) ; CHECK-NEXT: [[T1:%.*]] = xor i8 [[T0]], -1 ; CHECK-NEXT: call void @use8(i8 [[T1]]) -; CHECK-NEXT: [[RET:%.*]] = icmp ult i8 [[T1]], [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp ne i8 [[TMP1]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 -1, %y @@ -269,9 +271,8 @@ define i1 @n0(i8 %x, i8 %y, i8 %notx) { define i1 @n1(i8 %x, i8 %y) { ; CHECK-LABEL: @n1( ; CHECK-NEXT: [[T0:%.*]] = shl nuw i8 1, [[Y:%.*]] -; CHECK-NEXT: [[T1:%.*]] = xor i8 [[T0]], -1 -; CHECK-NEXT: [[T2:%.*]] = and i8 [[T1]], [[X:%.*]] -; CHECK-NEXT: [[RET:%.*]] = icmp ne i8 [[T2]], [[X]] +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp ne i8 [[TMP1]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 1, %y ; not -1 @@ -284,9 +285,9 @@ define i1 @n1(i8 %x, i8 %y) { define i1 @n2(i8 %x, i8 %y) { ; CHECK-LABEL: @n2( ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, 
[[Y:%.*]] -; CHECK-NEXT: [[T1:%.*]] = xor i8 [[T0]], 1 +; CHECK-NEXT: [[T1:%.*]] = xor i8 [[T0]], -2 ; CHECK-NEXT: [[T2:%.*]] = and i8 [[T1]], [[X:%.*]] -; CHECK-NEXT: [[RET:%.*]] = icmp ne i8 [[T2]], [[X]] +; CHECK-NEXT: [[RET:%.*]] = icmp ne i8 [[T2]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 -1, %y diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-eq-to-icmp-ule.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-eq-to-icmp-ule.ll index a65be1e9ceeca3..c7c57b601eab38 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-eq-to-icmp-ule.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-eq-to-icmp-ule.ll @@ -251,9 +251,9 @@ define i1 @n1(i8 %x, i8 %y) { ; CHECK-LABEL: @n1( ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T0]]) -; CHECK-NEXT: [[T1:%.*]] = add i8 [[T0]], -1 +; CHECK-NEXT: [[T1:%.*]] = sub i8 0, [[T0]] ; CHECK-NEXT: [[T2:%.*]] = and i8 [[T1]], [[X:%.*]] -; CHECK-NEXT: [[RET:%.*]] = icmp eq i8 [[T2]], [[X]] +; CHECK-NEXT: [[RET:%.*]] = icmp eq i8 [[T2]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 -1, %y ; not 1 @@ -268,9 +268,9 @@ define i1 @n2(i8 %x, i8 %y) { ; CHECK-LABEL: @n2( ; CHECK-NEXT: [[T0:%.*]] = shl nuw i8 1, [[Y:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T0]]) -; CHECK-NEXT: [[T1:%.*]] = add nuw i8 [[T0]], 1 +; CHECK-NEXT: [[T1:%.*]] = sub nuw i8 -2, [[T0]] ; CHECK-NEXT: [[T2:%.*]] = and i8 [[T1]], [[X:%.*]] -; CHECK-NEXT: [[RET:%.*]] = icmp eq i8 [[T2]], [[X]] +; CHECK-NEXT: [[RET:%.*]] = icmp eq i8 [[T2]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 1, %y diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-ne-to-icmp-ugt.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-ne-to-icmp-ugt.ll index f156d9bf007cbb..d5826524f1637c 100644 --- 
a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-ne-to-icmp-ugt.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-ne-to-icmp-ugt.ll @@ -251,9 +251,9 @@ define i1 @n1(i8 %x, i8 %y) { ; CHECK-LABEL: @n1( ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T0]]) -; CHECK-NEXT: [[T1:%.*]] = add i8 [[T0]], -1 +; CHECK-NEXT: [[T1:%.*]] = sub i8 0, [[T0]] ; CHECK-NEXT: [[T2:%.*]] = and i8 [[T1]], [[X:%.*]] -; CHECK-NEXT: [[RET:%.*]] = icmp ne i8 [[T2]], [[X]] +; CHECK-NEXT: [[RET:%.*]] = icmp ne i8 [[T2]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 -1, %y ; not 1 @@ -268,9 +268,9 @@ define i1 @n2(i8 %x, i8 %y) { ; CHECK-LABEL: @n2( ; CHECK-NEXT: [[T0:%.*]] = shl nuw i8 1, [[Y:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T0]]) -; CHECK-NEXT: [[T1:%.*]] = add nuw i8 [[T0]], 1 +; CHECK-NEXT: [[T1:%.*]] = sub nuw i8 -2, [[T0]] ; CHECK-NEXT: [[T2:%.*]] = and i8 [[T1]], [[X:%.*]] -; CHECK-NEXT: [[RET:%.*]] = icmp ne i8 [[T2]], [[X]] +; CHECK-NEXT: [[RET:%.*]] = icmp ne i8 [[T2]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 1, %y diff --git a/llvm/test/Transforms/InstCombine/icmp-and-lowbit-mask.ll b/llvm/test/Transforms/InstCombine/icmp-and-lowbit-mask.ll index 5de3e89d7027ab..8bb7fd0e522cb0 100644 --- a/llvm/test/Transforms/InstCombine/icmp-and-lowbit-mask.ll +++ b/llvm/test/Transforms/InstCombine/icmp-and-lowbit-mask.ll @@ -5,9 +5,9 @@ declare void @use.i8(i8) declare void @use.i16(i16) define i1 @src_is_mask_zext(i16 %x_in, i8 %y) { ; CHECK-LABEL: @src_is_mask_zext( -; CHECK-NEXT: [[X:%.*]] = xor i16 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[M_IN:%.*]] = lshr i8 -1, [[Y:%.*]] ; CHECK-NEXT: [[MASK:%.*]] = zext i8 [[M_IN]] to i16 +; CHECK-NEXT: [[X:%.*]] = xor i16 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[R:%.*]] = icmp ule i16 [[X]], [[MASK]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -22,11 +22,11 @@ define i1 @src_is_mask_zext(i16 %x_in, i8 %y) { define i1 @src_is_mask_zext_fail_not_mask(i16 %x_in, i8 %y) { ; 
CHECK-LABEL: @src_is_mask_zext_fail_not_mask( -; CHECK-NEXT: [[X:%.*]] = xor i16 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[M_IN:%.*]] = lshr i8 -2, [[Y:%.*]] ; CHECK-NEXT: [[MASK:%.*]] = zext i8 [[M_IN]] to i16 -; CHECK-NEXT: [[AND:%.*]] = and i16 [[X]], [[MASK]] -; CHECK-NEXT: [[R:%.*]] = icmp eq i16 [[AND]], [[X]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i16 [[X_IN:%.*]], -124 +; CHECK-NEXT: [[TMP2:%.*]] = or i16 [[TMP1]], [[MASK]] +; CHECK-NEXT: [[R:%.*]] = icmp eq i16 [[TMP2]], -1 ; CHECK-NEXT: ret i1 [[R]] ; %x = xor i16 %x_in, 123 @@ -80,10 +80,10 @@ define i1 @src_is_mask_sext_fail_multiuse(i16 %x_in, i8 %y) { define i1 @src_is_mask_and(i8 %x_in, i8 %y, i8 %z) { ; CHECK-LABEL: @src_is_mask_and( -; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[MY:%.*]] = lshr i8 7, [[Y:%.*]] ; CHECK-NEXT: [[MZ:%.*]] = lshr i8 -1, [[Z:%.*]] ; CHECK-NEXT: [[MASK:%.*]] = and i8 [[MY]], [[MZ]] +; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X]], [[MASK]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -99,12 +99,12 @@ define i1 @src_is_mask_and(i8 %x_in, i8 %y, i8 %z) { define i1 @src_is_mask_and_fail_mixed(i8 %x_in, i8 %y, i8 %z) { ; CHECK-LABEL: @src_is_mask_and_fail_mixed( -; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[MY:%.*]] = ashr i8 -8, [[Y:%.*]] ; CHECK-NEXT: [[MZ:%.*]] = lshr i8 -1, [[Z:%.*]] ; CHECK-NEXT: [[MASK:%.*]] = and i8 [[MY]], [[MZ]] -; CHECK-NEXT: [[AND:%.*]] = and i8 [[X]], [[MASK]] -; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[X]], [[AND]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], -124 +; CHECK-NEXT: [[TMP2:%.*]] = or i8 [[MASK]], [[TMP1]] +; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[TMP2]], -1 ; CHECK-NEXT: ret i1 [[R]] ; %x = xor i8 %x_in, 123 @@ -119,9 +119,9 @@ define i1 @src_is_mask_and_fail_mixed(i8 %x_in, i8 %y, i8 %z) { define i1 @src_is_mask_or(i8 %x_in, i8 %y) { ; CHECK-LABEL: @src_is_mask_or( -; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[MY:%.*]] = lshr i8 -1, [[Y:%.*]] ; 
CHECK-NEXT: [[MASK:%.*]] = and i8 [[MY]], 7 +; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X]], [[MASK]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -136,9 +136,9 @@ define i1 @src_is_mask_or(i8 %x_in, i8 %y) { define i1 @src_is_mask_xor(i8 %x_in, i8 %y) { ; CHECK-LABEL: @src_is_mask_xor( -; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1 ; CHECK-NEXT: [[MASK:%.*]] = xor i8 [[Y_M1]], [[Y]] +; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[X]], [[MASK]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -152,11 +152,11 @@ define i1 @src_is_mask_xor(i8 %x_in, i8 %y) { define i1 @src_is_mask_xor_fail_notmask(i8 %x_in, i8 %y) { ; CHECK-LABEL: @src_is_mask_xor_fail_notmask( -; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[Y:%.*]] ; CHECK-NEXT: [[NOTMASK:%.*]] = xor i8 [[TMP1]], [[Y]] -; CHECK-NEXT: [[AND:%.*]] = and i8 [[X]], [[NOTMASK]] -; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[AND]], [[X]] +; CHECK-NEXT: [[TMP2:%.*]] = xor i8 [[X_IN:%.*]], -124 +; CHECK-NEXT: [[TMP3:%.*]] = or i8 [[NOTMASK]], [[TMP2]] +; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[TMP3]], -1 ; CHECK-NEXT: ret i1 [[R]] ; %x = xor i8 %x_in, 123 @@ -170,10 +170,10 @@ define i1 @src_is_mask_xor_fail_notmask(i8 %x_in, i8 %y) { define i1 @src_is_mask_select(i8 %x_in, i8 %y, i1 %cond) { ; CHECK-LABEL: @src_is_mask_select( -; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1 ; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]] ; CHECK-NEXT: [[MASK:%.*]] = select i1 [[COND:%.*]], i8 [[YMASK]], i8 15 +; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[X]], [[MASK]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -245,11 +245,11 @@ define i1 @src_is_mask_shl_lshr_fail_not_allones(i8 %x_in, i8 %y, i1 %cond) { define i1 @src_is_mask_lshr(i8 %x_in, i8 %y, i8 %z, i1 %cond) { ; CHECK-LABEL: 
@src_is_mask_lshr( -; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1 ; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]] ; CHECK-NEXT: [[SMASK:%.*]] = select i1 [[COND:%.*]], i8 [[YMASK]], i8 15 ; CHECK-NEXT: [[MASK:%.*]] = lshr i8 [[SMASK]], [[Z:%.*]] +; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[X]], [[MASK]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -265,11 +265,11 @@ define i1 @src_is_mask_lshr(i8 %x_in, i8 %y, i8 %z, i1 %cond) { define i1 @src_is_mask_ashr(i8 %x_in, i8 %y, i8 %z, i1 %cond) { ; CHECK-LABEL: @src_is_mask_ashr( -; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1 ; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]] ; CHECK-NEXT: [[SMASK:%.*]] = select i1 [[COND:%.*]], i8 [[YMASK]], i8 15 ; CHECK-NEXT: [[MASK:%.*]] = ashr i8 [[SMASK]], [[Z:%.*]] +; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[X]], [[MASK]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -285,9 +285,9 @@ define i1 @src_is_mask_ashr(i8 %x_in, i8 %y, i8 %z, i1 %cond) { define i1 @src_is_mask_p2_m1(i8 %x_in, i8 %y) { ; CHECK-LABEL: @src_is_mask_p2_m1( -; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[P2ORZ:%.*]] = shl i8 2, [[Y:%.*]] ; CHECK-NEXT: [[MASK:%.*]] = add i8 [[P2ORZ]], -1 +; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[X]], [[MASK]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -301,10 +301,10 @@ define i1 @src_is_mask_p2_m1(i8 %x_in, i8 %y) { define i1 @src_is_mask_umax(i8 %x_in, i8 %y) { ; CHECK-LABEL: @src_is_mask_umax( -; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1 ; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]] ; CHECK-NEXT: [[MASK:%.*]] = call i8 @llvm.umax.i8(i8 [[YMASK]], i8 3) +; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[X]], [[MASK]] ; CHECK-NEXT: 
ret i1 [[R]] ; @@ -320,11 +320,11 @@ define i1 @src_is_mask_umax(i8 %x_in, i8 %y) { define i1 @src_is_mask_umin(i8 %x_in, i8 %y, i8 %z) { ; CHECK-LABEL: @src_is_mask_umin( -; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1 ; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]] ; CHECK-NEXT: [[ZMASK:%.*]] = lshr i8 15, [[Z:%.*]] ; CHECK-NEXT: [[MASK:%.*]] = call i8 @llvm.umin.i8(i8 [[YMASK]], i8 [[ZMASK]]) +; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[X]], [[MASK]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -341,12 +341,12 @@ define i1 @src_is_mask_umin(i8 %x_in, i8 %y, i8 %z) { define i1 @src_is_mask_umin_fail_mismatch(i8 %x_in, i8 %y) { ; CHECK-LABEL: @src_is_mask_umin_fail_mismatch( -; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1 ; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]] ; CHECK-NEXT: [[MASK:%.*]] = call i8 @llvm.umin.i8(i8 [[YMASK]], i8 -32) -; CHECK-NEXT: [[AND:%.*]] = and i8 [[MASK]], [[X]] -; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[AND]], [[X]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], -124 +; CHECK-NEXT: [[TMP2:%.*]] = or i8 [[MASK]], [[TMP1]] +; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[TMP2]], -1 ; CHECK-NEXT: ret i1 [[R]] ; %x = xor i8 %x_in, 123 @@ -361,10 +361,10 @@ define i1 @src_is_mask_umin_fail_mismatch(i8 %x_in, i8 %y) { define i1 @src_is_mask_smax(i8 %x_in, i8 %y) { ; CHECK-LABEL: @src_is_mask_smax( -; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1 ; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]] ; CHECK-NEXT: [[MASK:%.*]] = call i8 @llvm.smax.i8(i8 [[YMASK]], i8 -1) +; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X]], [[MASK]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -380,10 +380,10 @@ define i1 @src_is_mask_smax(i8 %x_in, i8 %y) { define i1 @src_is_mask_smin(i8 %x_in, i8 %y) { ; CHECK-LABEL: 
@src_is_mask_smin( -; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1 ; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]] ; CHECK-NEXT: [[MASK:%.*]] = call i8 @llvm.smin.i8(i8 [[YMASK]], i8 0) +; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X]], [[MASK]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -399,9 +399,9 @@ define i1 @src_is_mask_smin(i8 %x_in, i8 %y) { define i1 @src_is_mask_bitreverse_not_mask(i8 %x_in, i8 %y) { ; CHECK-LABEL: @src_is_mask_bitreverse_not_mask( -; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[NMASK:%.*]] = shl nsw i8 -1, [[Y:%.*]] ; CHECK-NEXT: [[MASK:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[NMASK]]) +; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X]], [[MASK]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -417,7 +417,7 @@ define i1 @src_is_mask_bitreverse_not_mask(i8 %x_in, i8 %y) { define i1 @src_is_notmask_sext(i16 %x_in, i8 %y) { ; CHECK-LABEL: @src_is_notmask_sext( ; CHECK-NEXT: [[M_IN:%.*]] = shl i8 -8, [[Y:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = xor i16 [[X_IN:%.*]], -124 +; CHECK-NEXT: [[TMP1:%.*]] = xor i16 [[X_IN:%.*]], -128 ; CHECK-NEXT: [[TMP2:%.*]] = sext i8 [[M_IN]] to i16 ; CHECK-NEXT: [[R:%.*]] = icmp uge i16 [[TMP1]], [[TMP2]] ; CHECK-NEXT: ret i1 [[R]] @@ -529,12 +529,11 @@ define i1 @src_is_notmask_lshr_shl(i8 %x_in, i8 %y) { define i1 @src_is_notmask_lshr_shl_fail_mismatch_shifts(i8 %x_in, i8 %y, i8 %z) { ; CHECK-LABEL: @src_is_notmask_lshr_shl_fail_mismatch_shifts( -; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[MASK_SHR:%.*]] = lshr i8 -1, [[Y:%.*]] ; CHECK-NEXT: [[NMASK:%.*]] = shl i8 [[MASK_SHR]], [[Z:%.*]] -; CHECK-NEXT: [[MASK:%.*]] = xor i8 [[NMASK]], -1 -; CHECK-NEXT: [[AND:%.*]] = and i8 [[X]], [[MASK]] -; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[AND]], [[X]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], 123 +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[NMASK]] 
+; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[TMP2]], 0 ; CHECK-NEXT: ret i1 [[R]] ; %x = xor i8 %x_in, 123 diff --git a/llvm/test/Transforms/InstCombine/icmp-of-and-x.ll b/llvm/test/Transforms/InstCombine/icmp-of-and-x.ll index 75070e5a34f949..0f26be12c39cc6 100644 --- a/llvm/test/Transforms/InstCombine/icmp-of-and-x.ll +++ b/llvm/test/Transforms/InstCombine/icmp-of-and-x.ll @@ -239,9 +239,9 @@ define i1 @icmp_sle_negx_y_fail_maybe_zero(i8 %x, i8 %y) { define i1 @icmp_eq_x_invertable_y_todo(i8 %x, i1 %y) { ; CHECK-LABEL: @icmp_eq_x_invertable_y_todo( -; CHECK-NEXT: [[YY:%.*]] = select i1 [[Y:%.*]], i8 7, i8 24 +; CHECK-NEXT: [[YY:%.*]] = select i1 [[Y:%.*]], i8 -8, i8 -25 ; CHECK-NEXT: [[AND:%.*]] = and i8 [[YY]], [[X:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[AND]], [[X]] +; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[AND]], 0 ; CHECK-NEXT: ret i1 [[R]] ; %yy = select i1 %y, i8 7, i8 24 @@ -252,9 +252,8 @@ define i1 @icmp_eq_x_invertable_y_todo(i8 %x, i1 %y) { define i1 @icmp_eq_x_invertable_y(i8 %x, i8 %y) { ; CHECK-LABEL: @icmp_eq_x_invertable_y( -; CHECK-NEXT: [[YY:%.*]] = xor i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[AND:%.*]] = and i8 [[YY]], [[X:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[AND]], [[X]] +; CHECK-NEXT: [[AND:%.*]] = and i8 [[YY:%.*]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[AND]], 0 ; CHECK-NEXT: ret i1 [[R]] ; %yy = xor i8 %y, -1 @@ -280,9 +279,9 @@ define i1 @icmp_eq_x_invertable_y_fail_multiuse(i8 %x, i8 %y) { define i1 @icmp_eq_x_invertable_y2_todo(i8 %x, i1 %y) { ; CHECK-LABEL: @icmp_eq_x_invertable_y2_todo( -; CHECK-NEXT: [[YY:%.*]] = select i1 [[Y:%.*]], i8 7, i8 24 -; CHECK-NEXT: [[AND:%.*]] = and i8 [[YY]], [[X:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[YY]], [[AND]] +; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[Y:%.*]], i8 -8, i8 -25 +; CHECK-NEXT: [[TMP2:%.*]] = or i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[TMP2]], -1 ; CHECK-NEXT: ret i1 [[R]] ; %yy = select i1 %y, i8 7, i8 24 @@ -293,9 +292,8 @@ define i1 
@icmp_eq_x_invertable_y2_todo(i8 %x, i1 %y) { define i1 @icmp_eq_x_invertable_y2(i8 %x, i8 %y) { ; CHECK-LABEL: @icmp_eq_x_invertable_y2( -; CHECK-NEXT: [[YY:%.*]] = xor i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[AND:%.*]] = and i8 [[YY]], [[X:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[AND]], [[YY]] +; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[TMP1]], -1 ; CHECK-NEXT: ret i1 [[R]] ; %yy = xor i8 %y, -1 From a9e8a3a18eb897196f88d3705ccd966f5b52c012 Mon Sep 17 00:00:00 2001 From: Shengchen Kan Date: Wed, 29 May 2024 14:34:34 +0800 Subject: [PATCH 072/230] [X86][CodeGen] Extend X86CompressEVEX for NF transform --- llvm/lib/Target/X86/X86CompressEVEX.cpp | 43 ++- llvm/lib/Target/X86/X86InstrInfo.cpp | 8 + llvm/lib/Target/X86/X86InstrInfo.h | 3 + llvm/test/CodeGen/X86/apx/add.ll | 319 +++++++++++++++++++ llvm/test/CodeGen/X86/apx/and.ll | 317 +++++++++++++++++++ llvm/test/CodeGen/X86/apx/compress-evex.mir | 26 +- llvm/test/CodeGen/X86/apx/dec.ll | 64 ++++ llvm/test/CodeGen/X86/apx/imul.ll | 62 ++++ llvm/test/CodeGen/X86/apx/inc.ll | 95 ++++++ llvm/test/CodeGen/X86/apx/neg.ll | 103 +++++++ llvm/test/CodeGen/X86/apx/or.ll | 315 +++++++++++++++++++ llvm/test/CodeGen/X86/apx/shl.ll | 276 +++++++++++++++++ llvm/test/CodeGen/X86/apx/shr.ll | 277 +++++++++++++++++ llvm/test/CodeGen/X86/apx/sub.ll | 323 ++++++++++++++++++++ llvm/test/CodeGen/X86/apx/xor.ll | 292 ++++++++++++++++++ 15 files changed, 2505 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp index 6442cc21933085..cadfda93d4b196 100644 --- a/llvm/lib/Target/X86/X86CompressEVEX.cpp +++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp @@ -14,6 +14,7 @@ // b. Promoted instruction (EVEX) -> pre-promotion instruction (legacy/VEX) // c. NDD (EVEX) -> non-NDD (legacy) // d. NF_ND (EVEX) -> NF (EVEX) +// e. 
NonNF (EVEX) -> NF (EVEX) // // Compression a, b and c can always reduce code size, with some exceptions // such as promoted 16-bit CRC32 which is as long as the legacy version. @@ -30,6 +31,9 @@ // // Compression d can help hardware decode (HW may skip reading the NDD // register) although the instruction length remains unchanged. +// +// Compression e can help hardware skip updating EFLAGS although the instruction +// length remains unchanged. //===----------------------------------------------------------------------===// #include "MCTargetDesc/X86BaseInfo.h" @@ -219,25 +223,36 @@ static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) { return false; // MOVBE*rr is special because it has semantic of NDD but not set EVEX_B. bool IsNDLike = IsND || Opc == X86::MOVBE32rr || Opc == X86::MOVBE64rr; - if (IsNDLike && !isRedundantNewDataDest(MI, ST)) + bool IsRedundantNDD = IsNDLike ? isRedundantNewDataDest(MI, ST) : false; + // NonNF -> NF only if it's not a compressible NDD instruction and eflags is + // dead. + unsigned NFOpc = (ST.hasNF() && !IsRedundantNDD && + MI.registerDefIsDead(X86::EFLAGS, /*TRI=*/nullptr)) + ? 
X86::getNFVariant(Opc) + : 0U; + if (IsNDLike && !IsRedundantNDD && !NFOpc) return false; - ArrayRef Table = ArrayRef(X86CompressEVEXTable); - - Opc = MI.getOpcode(); - const auto *I = llvm::lower_bound(Table, Opc); - if (I == Table.end() || I->OldOpc != Opc) { - assert(!IsNDLike && "Missing entry for ND-like instruction"); - return false; - } + unsigned NewOpc = NFOpc; + if (!NewOpc) { + ArrayRef Table = ArrayRef(X86CompressEVEXTable); - if (!IsNDLike) { - if (usesExtendedRegister(MI) || !checkPredicate(I->NewOpc, &ST) || - !performCustomAdjustments(MI, I->NewOpc)) + Opc = MI.getOpcode(); + const auto I = llvm::lower_bound(Table, Opc); + if (I == Table.end() || I->OldOpc != Opc) { + assert(!IsNDLike && "Missing entry for ND-like instruction"); return false; + } + + if (!IsNDLike) { + if (usesExtendedRegister(MI) || !checkPredicate(I->NewOpc, &ST) || + !performCustomAdjustments(MI, I->NewOpc)) + return false; + } + NewOpc = I->NewOpc; } - const MCInstrDesc &NewDesc = ST.getInstrInfo()->get(I->NewOpc); + const MCInstrDesc &NewDesc = ST.getInstrInfo()->get(NewOpc); MI.setDesc(NewDesc); unsigned AsmComment; switch (NewDesc.TSFlags & X86II::EncodingMask) { @@ -256,7 +271,7 @@ static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) { llvm_unreachable("Unknown EVEX compression"); } MI.setAsmPrinterFlag(AsmComment); - if (IsNDLike) + if (IsRedundantNDD) MI.tieOperands(0, 1); return true; diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 7d05f950b6fe99..3e391da807889f 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -3221,6 +3221,14 @@ int X86::getCCMPCondFlagsFromCondCode(X86::CondCode CC) { } } +#define GET_X86_NF_TRANSFORM_TABLE +#include "X86GenInstrMapping.inc" +unsigned X86::getNFVariant(unsigned Opc) { + ArrayRef Table = ArrayRef(X86NFTransformTable); + const auto I = llvm::lower_bound(Table, Opc); + return (I == Table.end() || I->OldOpc != Opc) ? 
0U : I->NewOpc; +} + /// Return the inverse of the specified condition, /// e.g. turning COND_E to COND_NE. X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) { diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 295fac60c6e406..9eb2bd56b2ab5c 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -77,6 +77,9 @@ CondCode getCondFromCCMP(const MachineInstr &MI); // Turn condition code into condition flags for CCMP/CTEST. int getCCMPCondFlagsFromCondCode(CondCode CC); +// Get the opcode of corresponding NF variant. +unsigned getNFVariant(unsigned Opc); + /// GetOppositeBranchCondition - Return the inverse of the specified cond, /// e.g. turning COND_E to COND_NE. CondCode GetOppositeBranchCondition(CondCode CC); diff --git a/llvm/test/CodeGen/X86/apx/add.ll b/llvm/test/CodeGen/X86/apx/add.ll index d3301ecdb72d0f..7779ae599f2004 100644 --- a/llvm/test/CodeGen/X86/apx/add.ll +++ b/llvm/test/CodeGen/X86/apx/add.ll @@ -1,11 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s define i8 @add8rr(i8 noundef %a, i8 noundef %b) { ; CHECK-LABEL: add8rr: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addb %sil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x00,0xf7] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add8rr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addb %sil, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x00,0xf7] +; NF-NEXT: retq # encoding: [0xc3] entry: %add = add i8 %a, %b ret i8 %add @@ -17,6 +23,12 @@ define i16 @add16rr(i16 noundef %a, i16 noundef %b) { ; CHECK-NEXT: addl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xf7] ; CHECK-NEXT: # kill: def $ax killed $ax killed 
$eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add16rr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x01,0xf7] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %add = add i16 %a, %b ret i16 %add @@ -27,6 +39,11 @@ define i32 @add32rr(i32 noundef %a, i32 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xf7] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add32rr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x01,0xf7] +; NF-NEXT: retq # encoding: [0xc3] entry: %add = add i32 %a, %b ret i32 %add @@ -37,6 +54,11 @@ define i64 @add64rr(i64 noundef %a, i64 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addq %rsi, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x01,0xf7] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add64rr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addq %rsi, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x01,0xf7] +; NF-NEXT: retq # encoding: [0xc3] entry: %add = add i64 %a, %b ret i64 %add @@ -47,6 +69,11 @@ define i8 @add8rm(i8 noundef %a, ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x02,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add8rm: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addb (%rsi), %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x02,0x3e] +; NF-NEXT: retq # encoding: [0xc3] entry: %b = load i8, ptr %ptr %add = add i8 %a, %b @@ -58,6 +85,11 @@ define i16 @add16rm(i16 noundef %a, ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x03,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add16rm: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addw (%rsi), %di, %ax # 
EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x03,0x3e] +; NF-NEXT: retq # encoding: [0xc3] entry: %b = load i16, ptr %ptr %add = add i16 %a, %b @@ -69,6 +101,11 @@ define i32 @add32rm(i32 noundef %a, ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x03,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add32rm: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addl (%rsi), %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x03,0x3e] +; NF-NEXT: retq # encoding: [0xc3] entry: %b = load i32, ptr %ptr %add = add i32 %a, %b @@ -80,6 +117,11 @@ define i64 @add64rm(i64 noundef %a, ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x03,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add64rm: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addq (%rsi), %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x03,0x3e] +; NF-NEXT: retq # encoding: [0xc3] entry: %b = load i64, ptr %ptr %add = add i64 %a, %b @@ -92,6 +134,12 @@ define i16 @add16ri8(i16 noundef %a) { ; CHECK-NEXT: addl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xc7,0x7b] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add16ri8: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addl $123, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xc7,0x7b] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %add = add i16 %a, 123 ret i16 %add @@ -102,6 +150,11 @@ define i32 @add32ri8(i32 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xc7,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add32ri8: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addl $123, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xc7,0x7b] +; NF-NEXT: 
retq # encoding: [0xc3] entry: %add = add i32 %a, 123 ret i32 %add @@ -112,6 +165,11 @@ define i64 @add64ri8(i64 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addq $123, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xc7,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add64ri8: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addq $123, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x83,0xc7,0x7b] +; NF-NEXT: retq # encoding: [0xc3] entry: %add = add i64 %a, 123 ret i64 %add @@ -122,6 +180,11 @@ define i8 @add8ri(i8 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addb $123, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0xc7,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add8ri: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addb $123, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0xc7,0x7b] +; NF-NEXT: retq # encoding: [0xc3] entry: %add = add i8 %a, 123 ret i8 %add @@ -134,6 +197,13 @@ define i16 @add16ri(i16 noundef %a) { ; CHECK-NEXT: # imm = 0x4D2 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add16ri: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addl $1234, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xc7,0xd2,0x04,0x00,0x00] +; NF-NEXT: # imm = 0x4D2 +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %add = add i16 %a, 1234 ret i16 %add @@ -145,6 +215,12 @@ define i32 @add32ri(i32 noundef %a) { ; CHECK-NEXT: addl $123456, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xc7,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add32ri: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addl $123456, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xc7,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: retq # encoding: [0xc3] entry: %add = add i32 %a, 123456 ret i32 %add 
@@ -156,6 +232,12 @@ define i64 @add64ri(i64 noundef %a) { ; CHECK-NEXT: addq $123456, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0xc7,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add64ri: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addq $123456, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0xc7,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: retq # encoding: [0xc3] entry: %add = add i64 %a, 123456 ret i64 %add @@ -166,6 +248,11 @@ define i8 @add8mr(ptr %a, i8 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addb %sil, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x00,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add8mr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addb %sil, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x00,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i8, ptr %a %add = add nsw i8 %t, %b @@ -177,6 +264,11 @@ define i16 @add16mr(ptr %a, i16 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addw %si, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x01,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add16mr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addw %si, (%rdi), %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x01,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i16, ptr %a %add = add nsw i16 %t, %b @@ -188,6 +280,11 @@ define i32 @add32mr(ptr %a, i32 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addl %esi, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add32mr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addl %esi, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x01,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i32, ptr %a %add = add nsw i32 %t, %b @@ -199,6 +296,11 @@ define i64 @add64mr(ptr %a, i64 noundef %b) { ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: addq %rsi, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x01,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add64mr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addq %rsi, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x01,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i64, ptr %a %add = add nsw i64 %t, %b @@ -212,6 +314,13 @@ define i16 @add16mi8(ptr %a) { ; CHECK-NEXT: addl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xc0,0x7b] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add16mi8: +; NF: # %bb.0: # %entry +; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07] +; NF-NEXT: addl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xc0,0x7b] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i16, ptr %a %add = add nsw i16 %t, 123 @@ -223,6 +332,11 @@ define i32 @add32mi8(ptr %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addl $123, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x07,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add32mi8: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addl $123, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0x07,0x7b] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i32, ptr %a %add = add nsw i32 %t, 123 @@ -234,6 +348,11 @@ define i64 @add64mi8(ptr %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addq $123, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0x07,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add64mi8: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addq $123, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x83,0x07,0x7b] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i64, ptr %a %add = add nsw i64 %t, 123 @@ -245,6 +364,11 @@ define i8 @add8mi(ptr %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addb $123, (%rdi), %al # encoding: 
[0x62,0xf4,0x7c,0x18,0x80,0x07,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add8mi: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addb $123, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0x07,0x7b] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i8, ptr %a %add = add nsw i8 %t, 123 @@ -259,6 +383,14 @@ define i16 @add16mi(ptr %a) { ; CHECK-NEXT: # imm = 0x4D2 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add16mi: +; NF: # %bb.0: # %entry +; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07] +; NF-NEXT: addl $1234, %eax # EVEX TO LEGACY Compression encoding: [0x05,0xd2,0x04,0x00,0x00] +; NF-NEXT: # imm = 0x4D2 +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i16, ptr %a %add = add nsw i16 %t, 1234 @@ -271,6 +403,12 @@ define i32 @add32mi(ptr %a) { ; CHECK-NEXT: addl $123456, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0x07,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add32mi: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addl $123456, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0x07,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i32, ptr %a %add = add nsw i32 %t, 123456 @@ -283,6 +421,12 @@ define i64 @add64mi(ptr %a) { ; CHECK-NEXT: addq $123456, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x07,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add64mi: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addq $123456, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0x07,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i64, ptr %a %add = add nsw i64 %t, 123456 @@ -303,6 +447,15 @@ define i8 @addflag8rr(i8 noundef %a, i8 noundef 
%b) { ; CHECK-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1] ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: addflag8rr: +; NF: # %bb.0: # %entry +; NF-NEXT: addb %sil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x00,0xf7] +; NF-NEXT: movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0] +; NF-NEXT: movl $255, %ecx # encoding: [0xb9,0xff,0x00,0x00,0x00] +; NF-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1] +; NF-NEXT: # kill: def $al killed $al killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %add = call i8 @llvm.uadd.sat.i8(i8 %a, i8 %b) ret i8 %add @@ -317,6 +470,15 @@ define i16 @addflag16rr(i16 noundef %a, i16 noundef %b) { ; CHECK-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: addflag16rr: +; NF: # %bb.0: # %entry +; NF-NEXT: addw %si, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x01,0xf7] +; NF-NEXT: movl $65535, %ecx # encoding: [0xb9,0xff,0xff,0x00,0x00] +; NF-NEXT: # imm = 0xFFFF +; NF-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %add = call i16 @llvm.uadd.sat.i16(i16 %a, i16 %b) ret i16 %add @@ -329,6 +491,13 @@ define i32 @addflag32rr(i32 noundef %a, i32 noundef %b) { ; CHECK-NEXT: movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff] ; CHECK-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: addflag32rr: +; NF: # %bb.0: # %entry +; NF-NEXT: addl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xf7] +; NF-NEXT: movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff] +; NF-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1] +; NF-NEXT: retq # encoding: [0xc3] 
entry: %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b) ret i32 %add @@ -341,6 +510,13 @@ define i64 @addflag64rr(i64 noundef %a, i64 noundef %b) { ; CHECK-NEXT: movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff] ; CHECK-NEXT: cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: addflag64rr: +; NF: # %bb.0: # %entry +; NF-NEXT: addq %rsi, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x01,0xf7] +; NF-NEXT: movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff] +; NF-NEXT: cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1] +; NF-NEXT: retq # encoding: [0xc3] entry: %add = call i64 @llvm.uadd.sat.i64(i64 %a, i64 %b) ret i64 %add @@ -355,6 +531,15 @@ define i8 @addflag8rm(i8 noundef %a, ptr %b) { ; CHECK-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1] ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: addflag8rm: +; NF: # %bb.0: # %entry +; NF-NEXT: addb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x02,0x3e] +; NF-NEXT: movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0] +; NF-NEXT: movl $255, %ecx # encoding: [0xb9,0xff,0x00,0x00,0x00] +; NF-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1] +; NF-NEXT: # kill: def $al killed $al killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %t = load i8, ptr %b %add = call i8 @llvm.uadd.sat.i8(i8 %a, i8 %t) @@ -370,6 +555,15 @@ define i16 @addflag16rm(i16 noundef %a, ptr %b) { ; CHECK-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: addflag16rm: +; NF: # %bb.0: # %entry +; NF-NEXT: addw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x03,0x3e] +; NF-NEXT: movl $65535, %ecx # encoding: [0xb9,0xff,0xff,0x00,0x00] +; NF-NEXT: # imm = 0xFFFF +; 
NF-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %t = load i16, ptr %b %add = call i16 @llvm.uadd.sat.i16(i16 %a, i16 %t) @@ -383,6 +577,13 @@ define i32 @addflag32rm(i32 noundef %a, ptr %b) { ; CHECK-NEXT: movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff] ; CHECK-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: addflag32rm: +; NF: # %bb.0: # %entry +; NF-NEXT: addl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x03,0x3e] +; NF-NEXT: movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff] +; NF-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1] +; NF-NEXT: retq # encoding: [0xc3] entry: %t = load i32, ptr %b %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %t) @@ -396,6 +597,13 @@ define i64 @addflag64rm(i64 noundef %a, ptr %b) { ; CHECK-NEXT: movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff] ; CHECK-NEXT: cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: addflag64rm: +; NF: # %bb.0: # %entry +; NF-NEXT: addq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x03,0x3e] +; NF-NEXT: movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff] +; NF-NEXT: cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1] +; NF-NEXT: retq # encoding: [0xc3] entry: %t = load i64, ptr %b %add = call i64 @llvm.uadd.sat.i64(i64 %a, i64 %t) @@ -411,6 +619,15 @@ define i16 @addflag16ri8(i16 noundef %a) { ; CHECK-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: addflag16ri8: +; NF: # %bb.0: # %entry +; NF-NEXT: addw $123, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x83,0xc7,0x7b] +; 
NF-NEXT: movl $65535, %ecx # encoding: [0xb9,0xff,0xff,0x00,0x00] +; NF-NEXT: # imm = 0xFFFF +; NF-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %add = call i16 @llvm.uadd.sat.i16(i16 %a, i16 123) ret i16 %add @@ -423,6 +640,13 @@ define i32 @addflag32ri8(i32 noundef %a) { ; CHECK-NEXT: movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff] ; CHECK-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: addflag32ri8: +; NF: # %bb.0: # %entry +; NF-NEXT: addl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xc7,0x7b] +; NF-NEXT: movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff] +; NF-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1] +; NF-NEXT: retq # encoding: [0xc3] entry: %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 123) ret i32 %add @@ -435,6 +659,13 @@ define i64 @addflag64ri8(i64 noundef %a) { ; CHECK-NEXT: movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff] ; CHECK-NEXT: cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: addflag64ri8: +; NF: # %bb.0: # %entry +; NF-NEXT: addq $123, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xc7,0x7b] +; NF-NEXT: movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff] +; NF-NEXT: cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1] +; NF-NEXT: retq # encoding: [0xc3] entry: %add = call i64 @llvm.uadd.sat.i64(i64 %a, i64 123) ret i64 %add @@ -449,6 +680,15 @@ define i8 @addflag8ri(i8 noundef %a) { ; CHECK-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1] ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: addflag8ri: +; NF: # %bb.0: # %entry +; NF-NEXT: addb $123, 
%dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0xc7,0x7b] +; NF-NEXT: movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0] +; NF-NEXT: movl $255, %ecx # encoding: [0xb9,0xff,0x00,0x00,0x00] +; NF-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1] +; NF-NEXT: # kill: def $al killed $al killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %add = call i8 @llvm.uadd.sat.i8(i8 %a, i8 123) ret i8 %add @@ -464,6 +704,16 @@ define i16 @addflag16ri(i16 noundef %a) { ; CHECK-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: addflag16ri: +; NF: # %bb.0: # %entry +; NF-NEXT: addw $1234, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x81,0xc7,0xd2,0x04] +; NF-NEXT: # imm = 0x4D2 +; NF-NEXT: movl $65535, %ecx # encoding: [0xb9,0xff,0xff,0x00,0x00] +; NF-NEXT: # imm = 0xFFFF +; NF-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %add = call i16 @llvm.uadd.sat.i16(i16 %a, i16 1234) ret i16 %add @@ -477,6 +727,14 @@ define i32 @addflag32ri(i32 noundef %a) { ; CHECK-NEXT: movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff] ; CHECK-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: addflag32ri: +; NF: # %bb.0: # %entry +; NF-NEXT: addl $123456, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xc7,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff] +; NF-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1] +; NF-NEXT: retq # encoding: [0xc3] entry: %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 123456) ret i32 %add @@ -490,6 +748,14 @@ define i64 @addflag64ri(i64 noundef %a) { ; CHECK-NEXT: movq $-1, %rcx # encoding: 
[0x48,0xc7,0xc1,0xff,0xff,0xff,0xff] ; CHECK-NEXT: cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: addflag64ri: +; NF: # %bb.0: # %entry +; NF-NEXT: addq $123456, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0xc7,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff] +; NF-NEXT: cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1] +; NF-NEXT: retq # encoding: [0xc3] entry: %add = call i64 @llvm.uadd.sat.i64(i64 %a, i64 123456) ret i64 %add @@ -507,6 +773,16 @@ define i1 @add64ri_reloc(i16 %k) { ; CHECK-NEXT: # fixup A - offset: 2, value: val, kind: reloc_signed_4byte ; CHECK-NEXT: setne %al # encoding: [0x0f,0x95,0xc0] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add64ri_reloc: +; NF: # %bb.0: +; NF-NEXT: # kill: def $edi killed $edi def $rdi +; NF-NEXT: movswq %di, %rax # encoding: [0x48,0x0f,0xbf,0xc7] +; NF-NEXT: addq %rax, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xc0] +; NF-NEXT: addq $val, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x05,A,A,A,A] +; NF-NEXT: # fixup A - offset: 2, value: val, kind: reloc_signed_4byte +; NF-NEXT: setne %al # encoding: [0x0f,0x95,0xc0] +; NF-NEXT: retq # encoding: [0xc3] %g = getelementptr inbounds i16, ptr @val, i16 %k %cmp = icmp ne ptr %g, null ret i1 %cmp @@ -517,6 +793,11 @@ define void @add8mr_legacy(ptr %a, i8 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addb %sil, (%rdi) # encoding: [0x40,0x00,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add8mr_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: addb %sil, (%rdi) # encoding: [0x40,0x00,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i8, ptr %a %add = add i8 %t, %b @@ -529,6 +810,11 @@ define void @add16mr_legacy(ptr %a, i16 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addw %si, (%rdi) # encoding: [0x66,0x01,0x37] ; 
CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add16mr_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: addw %si, (%rdi) # encoding: [0x66,0x01,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i16, ptr %a %add = add i16 %t, %b @@ -541,6 +827,11 @@ define void @add32mr_legacy(ptr %a, i32 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addl %esi, (%rdi) # encoding: [0x01,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add32mr_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: addl %esi, (%rdi) # encoding: [0x01,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i32, ptr %a %add = add i32 %t, %b @@ -553,6 +844,11 @@ define void @add64mr_legacy(ptr %a, i64 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addq %rsi, (%rdi) # encoding: [0x48,0x01,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add64mr_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: addq %rsi, (%rdi) # encoding: [0x48,0x01,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i64, ptr %a %add = add i64 %t, %b @@ -565,6 +861,11 @@ define void @add8mi_legacy(ptr %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addb $123, (%rdi) # encoding: [0x80,0x07,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add8mi_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: addb $123, (%rdi) # encoding: [0x80,0x07,0x7b] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i8, ptr %a %add = add nsw i8 %t, 123 @@ -578,6 +879,12 @@ define void @add16mi_legacy(ptr %a) { ; CHECK-NEXT: addw $1234, (%rdi) # encoding: [0x66,0x81,0x07,0xd2,0x04] ; CHECK-NEXT: # imm = 0x4D2 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add16mi_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: addw $1234, (%rdi) # encoding: [0x66,0x81,0x07,0xd2,0x04] +; NF-NEXT: # imm = 0x4D2 +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i16, ptr %a %add = add nsw i16 %t, 1234 @@ -591,6 +898,12 @@ define void @add32mi_legacy(ptr %a) { ; CHECK-NEXT: addl $123456, (%rdi) # encoding: 
[0x81,0x07,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add32mi_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: addl $123456, (%rdi) # encoding: [0x81,0x07,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i32, ptr %a %add = add nsw i32 %t, 123456 @@ -604,6 +917,12 @@ define void @add64mi_legacy(ptr %a) { ; CHECK-NEXT: addq $123456, (%rdi) # encoding: [0x48,0x81,0x07,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: add64mi_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: addq $123456, (%rdi) # encoding: [0x48,0x81,0x07,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i64, ptr %a %add = add nsw i64 %t, 123456 diff --git a/llvm/test/CodeGen/X86/apx/and.ll b/llvm/test/CodeGen/X86/apx/and.ll index af8f4119ac054d..58f54fbe50a524 100644 --- a/llvm/test/CodeGen/X86/apx/and.ll +++ b/llvm/test/CodeGen/X86/apx/and.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s define i8 @and8rr(i8 noundef %a, i8 noundef %b) { ; CHECK-LABEL: and8rr: @@ -7,6 +8,12 @@ define i8 @and8rr(i8 noundef %a, i8 noundef %b) { ; CHECK-NEXT: andl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x21,0xf7] ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and8rr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} andl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x21,0xf7] +; NF-NEXT: # kill: def $al killed $al killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %and = and i8 %a, %b ret i8 %and @@ -18,6 +25,12 @@ define i16 
@and16rr(i16 noundef %a, i16 noundef %b) { ; CHECK-NEXT: andl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x21,0xf7] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and16rr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} andl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x21,0xf7] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %and = and i16 %a, %b ret i16 %and @@ -28,6 +41,11 @@ define i32 @and32rr(i32 noundef %a, i32 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: andl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x21,0xf7] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and32rr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} andl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x21,0xf7] +; NF-NEXT: retq # encoding: [0xc3] entry: %and = and i32 %a, %b ret i32 %and @@ -38,6 +56,11 @@ define i64 @and64rr(i64 noundef %a, i64 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: andq %rsi, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x21,0xf7] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and64rr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} andq %rsi, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x21,0xf7] +; NF-NEXT: retq # encoding: [0xc3] entry: %and = and i64 %a, %b ret i64 %and @@ -48,6 +71,11 @@ define i8 @and8rm(i8 noundef %a, ptr %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: andb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x22,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and8rm: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} andb (%rsi), %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x22,0x3e] +; NF-NEXT: retq # encoding: [0xc3] entry: %t = load i8, ptr %b %and = and i8 %a, %t @@ -59,6 +87,11 @@ define i16 @and16rm(i16 noundef %a, ptr %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: andw (%rsi), %di, %ax # encoding: 
[0x62,0xf4,0x7d,0x18,0x23,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and16rm: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} andw (%rsi), %di, %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x23,0x3e] +; NF-NEXT: retq # encoding: [0xc3] entry: %t = load i16, ptr %b %and = and i16 %a, %t @@ -70,6 +103,11 @@ define i32 @and32rm(i32 noundef %a, ptr %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: andl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x23,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and32rm: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} andl (%rsi), %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x23,0x3e] +; NF-NEXT: retq # encoding: [0xc3] entry: %t = load i32, ptr %b %and = and i32 %a, %t @@ -81,6 +119,11 @@ define i64 @and64rm(i64 noundef %a, ptr %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: andq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x23,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and64rm: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} andq (%rsi), %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x23,0x3e] +; NF-NEXT: retq # encoding: [0xc3] entry: %t = load i64, ptr %b %and = and i64 %a, %t @@ -93,6 +136,12 @@ define i16 @and16ri8(i16 noundef %a) { ; CHECK-NEXT: andl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xe7,0x7b] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and16ri8: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} andl $123, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xe7,0x7b] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %and = and i16 %a, 123 ret i16 %and @@ -103,6 +152,11 @@ define i32 @and32ri8(i32 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: andl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xe7,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and32ri8: 
+; NF: # %bb.0: # %entry +; NF-NEXT: {nf} andl $123, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xe7,0x7b] +; NF-NEXT: retq # encoding: [0xc3] entry: %and = and i32 %a, 123 ret i32 %and @@ -113,6 +167,11 @@ define i64 @and64ri8(i64 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: andl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xe7,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and64ri8: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} andl $123, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xe7,0x7b] +; NF-NEXT: retq # encoding: [0xc3] entry: %and = and i64 %a, 123 ret i64 %and @@ -123,6 +182,11 @@ define i8 @and8ri(i8 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: andb $123, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0xe7,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and8ri: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} andb $123, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0xe7,0x7b] +; NF-NEXT: retq # encoding: [0xc3] entry: %and = and i8 %a, 123 ret i8 %and @@ -135,6 +199,13 @@ define i16 @and16ri(i16 noundef %a) { ; CHECK-NEXT: # imm = 0x4D2 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and16ri: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} andl $1234, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xe7,0xd2,0x04,0x00,0x00] +; NF-NEXT: # imm = 0x4D2 +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %and = and i16 %a, 1234 ret i16 %and @@ -146,6 +217,12 @@ define i32 @and32ri(i32 noundef %a) { ; CHECK-NEXT: andl $123456, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xe7,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and32ri: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} andl $123456, %edi, %eax # EVEX TO EVEX Compression encoding: 
[0x62,0xf4,0x7c,0x1c,0x81,0xe7,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: retq # encoding: [0xc3] entry: %and = and i32 %a, 123456 ret i32 %and @@ -157,6 +234,12 @@ define i64 @and64ri(i64 noundef %a) { ; CHECK-NEXT: andl $123456, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xe7,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and64ri: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} andl $123456, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xe7,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: retq # encoding: [0xc3] entry: %and = and i64 %a, 123456 ret i64 %and @@ -167,6 +250,11 @@ define i8 @and8mr(ptr %a, i8 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: andb %sil, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x20,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and8mr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} andb %sil, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x20,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i8, ptr %a %and = and i8 %t, %b @@ -178,6 +266,11 @@ define i16 @and16mr(ptr %a, i16 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: andw %si, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x21,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and16mr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} andw %si, (%rdi), %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x21,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i16, ptr %a %and = and i16 %t, %b @@ -189,6 +282,11 @@ define i32 @and32mr(ptr %a, i32 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: andl %esi, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x21,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and32mr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} andl %esi, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x21,0x37] +; NF-NEXT: retq # encoding: 
[0xc3] entry: %t= load i32, ptr %a %and = and i32 %t, %b @@ -200,6 +298,11 @@ define i64 @and64mr(ptr %a, i64 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: andq %rsi, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x21,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and64mr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} andq %rsi, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x21,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i64, ptr %a %and = and i64 %t, %b @@ -213,6 +316,13 @@ define i16 @and16mi8(ptr %a) { ; CHECK-NEXT: andl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xe0,0x7b] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and16mi8: +; NF: # %bb.0: # %entry +; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07] +; NF-NEXT: andl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xe0,0x7b] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i16, ptr %a %and = and i16 %t, 123 @@ -224,6 +334,11 @@ define i32 @and32mi8(ptr %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: andl $123, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x27,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and32mi8: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} andl $123, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0x27,0x7b] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i32, ptr %a %and = and i32 %t, 123 @@ -236,6 +351,12 @@ define i64 @and64mi8(ptr %a) { ; CHECK-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07] ; CHECK-NEXT: andl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xe0,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and64mi8: +; NF: # %bb.0: # %entry +; NF-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07] +; NF-NEXT: andl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xe0,0x7b] +; NF-NEXT: retq # encoding: 
[0xc3] entry: %t= load i64, ptr %a %and = and i64 %t, 123 @@ -247,6 +368,11 @@ define i8 @and8mi(ptr %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: andb $123, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0x27,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and8mi: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} andb $123, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0x27,0x7b] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i8, ptr %a %and = and i8 %t, 123 @@ -261,6 +387,14 @@ define i16 @and16mi(ptr %a) { ; CHECK-NEXT: # imm = 0x4D2 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and16mi: +; NF: # %bb.0: # %entry +; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07] +; NF-NEXT: andl $1234, %eax # EVEX TO LEGACY Compression encoding: [0x25,0xd2,0x04,0x00,0x00] +; NF-NEXT: # imm = 0x4D2 +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i16, ptr %a %and = and i16 %t, 1234 @@ -273,6 +407,12 @@ define i32 @and32mi(ptr %a) { ; CHECK-NEXT: andl $123456, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0x27,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and32mi: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} andl $123456, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0x27,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i32, ptr %a %and = and i32 %t, 123456 @@ -286,6 +426,13 @@ define i64 @and64mi(ptr %a) { ; CHECK-NEXT: andl $123456, %eax # EVEX TO LEGACY Compression encoding: [0x25,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and64mi: +; NF: # %bb.0: # %entry +; NF-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07] +; NF-NEXT: andl $123456, %eax # EVEX TO LEGACY Compression encoding: [0x25,0x40,0xe2,0x01,0x00] +; 
NF-NEXT: # imm = 0x1E240 +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i64, ptr %a %and = and i64 %t, 123456 @@ -303,6 +450,15 @@ define i1 @andflag8rr(i8 %a, i8 %b) { ; CHECK-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: andflag8rr: +; NF: # %bb.0: +; NF-NEXT: notb %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xf6,0xd6] +; NF-NEXT: andb %al, %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x20,0xc7] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %xor = xor i8 %b, -1 %v0 = and i8 %a, %xor ; 0xff << 50 %v1 = icmp eq i8 %v0, 0 @@ -319,6 +475,15 @@ define i1 @andflag16rr(i16 %a, i16 %b) { ; CHECK-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: andflag16rr: +; NF: # %bb.0: +; NF-NEXT: notl %esi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xf7,0xd6] +; NF-NEXT: andw %ax, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x21,0xc7] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %xor = xor i16 %b, -1 %v0 = and i16 %a, %xor ; 0xff << 50 %v1 = icmp eq i16 %v0, 0 @@ -334,6 +499,14 @@ define i1 @andflag32rr(i32 %a, i32 %b) { ; CHECK-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: andflag32rr: +; NF: # %bb.0: +; NF-NEXT: andl %esi, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x21,0xf7] +; NF-NEXT: sete %al # encoding: 
[0x0f,0x94,0xc0] +; NF-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %v0 = and i32 %a, %b ; 0xff << 50 %v1 = icmp eq i32 %v0, 0 store i32 %v0, ptr @d64 @@ -348,6 +521,14 @@ define i1 @andflag64rr(i64 %a, i64 %b) { ; CHECK-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: andflag64rr: +; NF: # %bb.0: +; NF-NEXT: andq %rsi, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x21,0xf7] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %v0 = and i64 %a, %b ; 0xff << 50 %v1 = icmp eq i64 %v0, 0 store i64 %v0, ptr @d64 @@ -363,6 +544,15 @@ define i1 @andflag8rm(ptr %ptr, i8 %b) { ; CHECK-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: andflag8rm: +; NF: # %bb.0: +; NF-NEXT: notb %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xf6,0xd6] +; NF-NEXT: andb (%rdi), %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x22,0x07] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %a = load i8, ptr %ptr %xor = xor i8 %b, -1 %v0 = and i8 %a, %xor ; 0xff << 50 @@ -380,6 +570,15 @@ define i1 @andflag16rm(ptr %ptr, i16 %b) { ; CHECK-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: andflag16rm: +; NF: # %bb.0: +; 
NF-NEXT: notl %esi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xf7,0xd6] +; NF-NEXT: andw (%rdi), %ax, %cx # encoding: [0x62,0xf4,0x75,0x18,0x23,0x07] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %a = load i16, ptr %ptr %xor = xor i16 %b, -1 %v0 = and i16 %a, %xor ; 0xff << 50 @@ -396,6 +595,14 @@ define i1 @andflag32rm(ptr %ptr, i32 %b) { ; CHECK-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: andflag32rm: +; NF: # %bb.0: +; NF-NEXT: andl (%rdi), %esi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x23,0x37] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %a = load i32, ptr %ptr %v0 = and i32 %a, %b ; 0xff << 50 %v1 = icmp eq i32 %v0, 0 @@ -411,6 +618,14 @@ define i1 @andflag64rm(ptr %ptr, i64 %b) { ; CHECK-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: andflag64rm: +; NF: # %bb.0: +; NF-NEXT: andq (%rdi), %rsi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x23,0x37] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %a = load i64, ptr %ptr %v0 = and i64 %a, %b ; 0xff << 50 %v1 = icmp eq i64 %v0, 0 @@ -426,6 +641,14 @@ define i1 @andflag8ri(i8 %a) { ; CHECK-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: 
reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: andflag8ri: +; NF: # %bb.0: +; NF-NEXT: andb $-124, %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xe7,0x84] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %xor = xor i8 123, -1 %v0 = and i8 %a, %xor ; 0xff << 50 %v1 = icmp eq i8 %v0, 0 @@ -442,6 +665,15 @@ define i1 @andflag16ri(i16 %a) { ; CHECK-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: andflag16ri: +; NF: # %bb.0: +; NF-NEXT: andw $-1235, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x81,0xe7,0x2d,0xfb] +; NF-NEXT: # imm = 0xFB2D +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %xor = xor i16 1234, -1 %v0 = and i16 %a, %xor ; 0xff << 50 %v1 = icmp eq i16 %v0, 0 @@ -458,6 +690,15 @@ define i1 @andflag32ri(i32 %a) { ; CHECK-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: andflag32ri: +; NF: # %bb.0: +; NF-NEXT: andl $123456, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x81,0xe7,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %v0 = and i32 %a, 123456 ; 0xff << 50 %v1 = icmp eq i32 %v0, 0 store i32 %v0, ptr @d64 @@ -473,6 +714,15 @@ define i1 @andflag64ri(i64 %a) { ; CHECK-NEXT: movq 
%rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: andflag64ri: +; NF: # %bb.0: +; NF-NEXT: andq $123456, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x81,0xe7,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %v0 = and i64 %a, 123456 ; 0xff << 50 %v1 = icmp eq i64 %v0, 0 store i64 %v0, ptr @d64 @@ -487,6 +737,14 @@ define i1 @andflag16ri8(i16 %a) { ; CHECK-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: andflag16ri8: +; NF: # %bb.0: +; NF-NEXT: andw $-124, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xe7,0x84] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %xor = xor i16 123, -1 %v0 = and i16 %a, %xor ; 0xff << 50 %v1 = icmp eq i16 %v0, 0 @@ -502,6 +760,14 @@ define i1 @andflag32ri8(i32 %a) { ; CHECK-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: andflag32ri8: +; NF: # %bb.0: +; NF-NEXT: andl $123, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x83,0xe7,0x7b] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %v0 = and i32 %a, 123 ; 0xff << 50 %v1 = icmp eq i32 %v0, 0 store i32 
%v0, ptr @d64 @@ -516,6 +782,14 @@ define i1 @andflag64ri8(i64 %a) { ; CHECK-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: andflag64ri8: +; NF: # %bb.0: +; NF-NEXT: andq $123, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x83,0xe7,0x7b] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %v0 = and i64 %a, 123 ; 0xff << 50 %v1 = icmp eq i64 %v0, 0 store i64 %v0, ptr @d64 @@ -527,6 +801,11 @@ define void @and8mr_legacy(ptr %a, i8 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: andb %sil, (%rdi) # encoding: [0x40,0x20,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and8mr_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: andb %sil, (%rdi) # encoding: [0x40,0x20,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i8, ptr %a %and = and i8 %t, %b @@ -539,6 +818,11 @@ define void @and16mr_legacy(ptr %a, i16 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: andw %si, (%rdi) # encoding: [0x66,0x21,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and16mr_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: andw %si, (%rdi) # encoding: [0x66,0x21,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i16, ptr %a %and = and i16 %t, %b @@ -551,6 +835,11 @@ define void @and32mr_legacy(ptr %a, i32 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: andl %esi, (%rdi) # encoding: [0x21,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and32mr_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: andl %esi, (%rdi) # encoding: [0x21,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i32, ptr %a %and = and i32 %t, %b @@ -563,6 +852,11 @@ define void @and64mr_legacy(ptr %a, i64 noundef %b) { ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: andq %rsi, (%rdi) # encoding: [0x48,0x21,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and64mr_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: andq %rsi, (%rdi) # encoding: [0x48,0x21,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i64, ptr %a %and = and i64 %t, %b @@ -575,6 +869,11 @@ define void @and8mi_legacy(ptr %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: andb $123, (%rdi) # encoding: [0x80,0x27,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and8mi_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: andb $123, (%rdi) # encoding: [0x80,0x27,0x7b] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i8, ptr %a %and = and i8 %t, 123 @@ -588,6 +887,12 @@ define void @and16mi_legacy(ptr %a) { ; CHECK-NEXT: andw $1234, (%rdi) # encoding: [0x66,0x81,0x27,0xd2,0x04] ; CHECK-NEXT: # imm = 0x4D2 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and16mi_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: andw $1234, (%rdi) # encoding: [0x66,0x81,0x27,0xd2,0x04] +; NF-NEXT: # imm = 0x4D2 +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i16, ptr %a %and = and i16 %t, 1234 @@ -601,6 +906,12 @@ define void @and32mi_legacy(ptr %a) { ; CHECK-NEXT: andl $123456, (%rdi) # encoding: [0x81,0x27,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and32mi_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: andl $123456, (%rdi) # encoding: [0x81,0x27,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i32, ptr %a %and = and i32 %t, 123456 @@ -614,6 +925,12 @@ define void @and64mi_legacy(ptr %a) { ; CHECK-NEXT: andq $123456, (%rdi) # encoding: [0x48,0x81,0x27,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: and64mi_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: andq $123456, (%rdi) # encoding: [0x48,0x81,0x27,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: retq # encoding: [0xc3] entry: 
%t= load i64, ptr %a %and = and i64 %t, 123456 diff --git a/llvm/test/CodeGen/X86/apx/compress-evex.mir b/llvm/test/CodeGen/X86/apx/compress-evex.mir index d8bef886e234f9..626904a7a692c1 100644 --- a/llvm/test/CodeGen/X86/apx/compress-evex.mir +++ b/llvm/test/CodeGen/X86/apx/compress-evex.mir @@ -1,4 +1,5 @@ -# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -start-before=x86-compress-evex -show-mc-encoding -o - | FileCheck %s +# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -start-before=x86-compress-evex -show-mc-encoding -o - | FileCheck --check-prefixes=CHECK,NDD %s +# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr,+nf -start-before=x86-compress-evex -show-mc-encoding -o - | FileCheck --check-prefixes=CHECK,NDD-NF %s ... --- @@ -46,7 +47,8 @@ name: ndd_2_non_ndd_incommutable body: | bb.0.entry: liveins: $rdi, $rsi - ; CHECK: subq %rax, %rsi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xc6] + ; NDD: subq %rax, %rsi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xc6] + ; NDD-NF: {nf} subq %rax, %rsi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x29,0xc6] renamable $rax = ADD64rr_ND killed renamable $rdi, renamable $rsi, implicit-def dead $eflags renamable $rax = SUB64rr_ND killed renamable $rsi, killed renamable $rax, implicit-def dead $eflags RET64 $rax @@ -55,7 +57,8 @@ body: | name: ndd_2_non_ndd_mem body: | bb.0.entry: - ; CHECK: addq $123456, (%rax), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x00,0x40,0xe2,0x01,0x00] + ; NDD: addq $123456, (%rax), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x00,0x40,0xe2,0x01,0x00] + ; NDD-NF: {nf} addq $123456, (%rax), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0x00,0x40,0xe2,0x01,0x00] renamable $rax = MOV64rm $noreg, 1, $noreg, 0, $fs renamable $rax = nsw ADD64mi32_ND killed renamable $rax, 1, $noreg, 0, $noreg, 123456, implicit-def dead $eflags RET64 $rax @@ -88,5 +91,20 @@ body: | ; CHECK: bswapq %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0xc8] 
renamable $rax = MOVBE64rr killed renamable $rax RET64 killed $rax - +... +--- +name: non_nf_2_nf +body: | + bb.0.entry: + liveins: $rdi, $r16 + ; CHECK: addq %r16, %rdi # encoding: [0xd5,0x48,0x01,0xc7] + ; NDD: xorq %r16, %rdi, %rax # encoding: [0x62,0xe4,0xfc,0x18,0x31,0xc7] + ; NDD-NF: {nf} xorq %r16, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xe4,0xfc,0x1c,0x31,0xc7] + ; CHECK: addq %r16, %rax, %rdi # encoding: [0x62,0xe4,0xc4,0x18,0x01,0xc0] + ; CHECK: adcq %rdi, %r16, %rax # encoding: [0x62,0xfc,0xfc,0x18,0x11,0xf8] + $rdi = ADD64rr $rdi, $r16, implicit-def dead $eflags + $rax = XOR64rr_ND $rdi, $r16, implicit-def dead $eflags + $rdi = ADD64rr_ND $rax, $r16, implicit-def $eflags + $rax = ADC64rr_ND $r16, $rdi, implicit-def dead $eflags, implicit $eflags + RET64 $rax ... diff --git a/llvm/test/CodeGen/X86/apx/dec.ll b/llvm/test/CodeGen/X86/apx/dec.ll index fcb2cae3b5cad8..a18ed2ace603ab 100644 --- a/llvm/test/CodeGen/X86/apx/dec.ll +++ b/llvm/test/CodeGen/X86/apx/dec.ll @@ -1,11 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs | FileCheck --check-prefix=NF %s define i8 @dec8r(i8 noundef %a) { ; CHECK-LABEL: dec8r: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: decb %dil, %al ; CHECK-NEXT: retq +; +; NF-LABEL: dec8r: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} decb %dil, %al +; NF-NEXT: retq entry: %dec = sub i8 %a, 1 ret i8 %dec @@ -17,6 +23,12 @@ define i16 @dec16r(i16 noundef %a) { ; CHECK-NEXT: decl %edi, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq +; +; NF-LABEL: dec16r: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} decl %edi, %eax +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq entry: %dec = sub i16 %a, 1 ret i16 %dec @@ -27,6 +39,11 @@ define i32 @dec32r(i32 noundef %a) { ; 
CHECK: # %bb.0: # %entry ; CHECK-NEXT: decl %edi, %eax ; CHECK-NEXT: retq +; +; NF-LABEL: dec32r: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} decl %edi, %eax +; NF-NEXT: retq entry: %dec = sub i32 %a, 1 ret i32 %dec @@ -37,6 +54,11 @@ define i64 @dec64r(i64 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: decq %rdi, %rax ; CHECK-NEXT: retq +; +; NF-LABEL: dec64r: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} decq %rdi, %rax +; NF-NEXT: retq entry: %dec = sub i64 %a, 1 ret i64 %dec @@ -47,6 +69,11 @@ define i8 @dec8m(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: decb (%rdi), %al ; CHECK-NEXT: retq +; +; NF-LABEL: dec8m: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} decb (%rdi), %al +; NF-NEXT: retq entry: %a = load i8, ptr %ptr %dec = sub i8 %a, 1 @@ -60,6 +87,13 @@ define i16 @dec16m(ptr %ptr) { ; CHECK-NEXT: decl %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq +; +; NF-LABEL: dec16m: +; NF: # %bb.0: # %entry +; NF-NEXT: movzwl (%rdi), %eax +; NF-NEXT: decl %eax +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq entry: %a = load i16, ptr %ptr %dec = sub i16 %a, 1 @@ -71,6 +105,11 @@ define i32 @dec32m(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: decl (%rdi), %eax ; CHECK-NEXT: retq +; +; NF-LABEL: dec32m: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} decl (%rdi), %eax +; NF-NEXT: retq entry: %a = load i32, ptr %ptr %dec = sub i32 %a, 1 @@ -82,6 +121,11 @@ define i64 @dec64m(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: decq (%rdi), %rax ; CHECK-NEXT: retq +; +; NF-LABEL: dec64m: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} decq (%rdi), %rax +; NF-NEXT: retq entry: %a = load i64, ptr %ptr %dec = sub i64 %a, 1 @@ -93,6 +137,11 @@ define void @dec8m_legacy(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: decb (%rdi) ; CHECK-NEXT: retq +; +; NF-LABEL: dec8m_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: decb (%rdi) +; NF-NEXT: retq entry: %a = load i8, ptr %ptr %dec = sub i8 %a, 1 @@ -105,6 +154,11 @@ define 
void @dec16m_legacy(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: decw (%rdi) ; CHECK-NEXT: retq +; +; NF-LABEL: dec16m_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: decw (%rdi) +; NF-NEXT: retq entry: %a = load i16, ptr %ptr %dec = sub i16 %a, 1 @@ -117,6 +171,11 @@ define void @dec32m_legacy(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: decl (%rdi) ; CHECK-NEXT: retq +; +; NF-LABEL: dec32m_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: decl (%rdi) +; NF-NEXT: retq entry: %a = load i32, ptr %ptr %dec = sub i32 %a, 1 @@ -129,6 +188,11 @@ define void @dec64m_legacy(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: decq (%rdi) ; CHECK-NEXT: retq +; +; NF-LABEL: dec64m_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: decq (%rdi) +; NF-NEXT: retq entry: %a = load i64, ptr %ptr %dec = sub i64 %a, 1 diff --git a/llvm/test/CodeGen/X86/apx/imul.ll b/llvm/test/CodeGen/X86/apx/imul.ll index 2963a6477be4c1..d97b2c0baec5e2 100644 --- a/llvm/test/CodeGen/X86/apx/imul.ll +++ b/llvm/test/CodeGen/X86/apx/imul.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs | FileCheck --check-prefix=NF %s define i16 @mul16rr(i16 noundef %a, i16 noundef %b) { ; CHECK-LABEL: mul16rr: @@ -7,6 +8,12 @@ define i16 @mul16rr(i16 noundef %a, i16 noundef %b) { ; CHECK-NEXT: imull %esi, %edi, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq +; +; NF-LABEL: mul16rr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} imull %esi, %edi, %eax +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq entry: %mul = mul i16 %a, %b ret i16 %mul @@ -17,6 +24,11 @@ define i32 @mul32rr(i32 noundef %a, i32 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: imull %esi, %edi, %eax ; CHECK-NEXT: retq +; +; NF-LABEL: mul32rr: +; NF: # %bb.0: # 
%entry +; NF-NEXT: {nf} imull %esi, %edi, %eax +; NF-NEXT: retq entry: %mul = mul i32 %a, %b ret i32 %mul @@ -27,6 +39,11 @@ define i64 @mul64rr(i64 noundef %a, i64 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: imulq %rsi, %rdi, %rax ; CHECK-NEXT: retq +; +; NF-LABEL: mul64rr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} imulq %rsi, %rdi, %rax +; NF-NEXT: retq entry: %mul = mul i64 %a, %b ret i64 %mul @@ -37,6 +54,11 @@ define i16 @smul16rr(i16 noundef %a, i16 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: imulw %si, %di, %ax ; CHECK-NEXT: retq +; +; NF-LABEL: smul16rr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} imulw %si, %di, %ax +; NF-NEXT: retq entry: %t = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a, i16 %b) %mul = extractvalue {i16, i1} %t, 0 @@ -48,6 +70,11 @@ define i32 @smul32rr(i32 noundef %a, i32 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: imull %esi, %edi, %eax ; CHECK-NEXT: retq +; +; NF-LABEL: smul32rr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} imull %esi, %edi, %eax +; NF-NEXT: retq entry: %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %a, i32 %b) %mul = extractvalue {i32, i1} %t, 0 @@ -59,6 +86,11 @@ define i64 @smul64rr(i64 noundef %a, i64 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: imulq %rsi, %rdi, %rax ; CHECK-NEXT: retq +; +; NF-LABEL: smul64rr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} imulq %rsi, %rdi, %rax +; NF-NEXT: retq entry: %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %a, i64 %b) %mul = extractvalue {i64, i1} %t, 0 @@ -70,6 +102,11 @@ define i16 @mul16rm(i16 noundef %a, ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: imulw (%rsi), %di, %ax ; CHECK-NEXT: retq +; +; NF-LABEL: mul16rm: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} imulw (%rsi), %di, %ax +; NF-NEXT: retq entry: %b = load i16, ptr %ptr %mul = mul i16 %a, %b @@ -81,6 +118,11 @@ define i32 @mul32rm(i32 noundef %a, ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: imull (%rsi), %edi, %eax ; CHECK-NEXT: retq +; +; 
NF-LABEL: mul32rm: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} imull (%rsi), %edi, %eax +; NF-NEXT: retq entry: %b = load i32, ptr %ptr %mul = mul i32 %a, %b @@ -92,6 +134,11 @@ define i64 @mul64rm(i64 noundef %a, ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: imulq (%rsi), %rdi, %rax ; CHECK-NEXT: retq +; +; NF-LABEL: mul64rm: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} imulq (%rsi), %rdi, %rax +; NF-NEXT: retq entry: %b = load i64, ptr %ptr %mul = mul i64 %a, %b @@ -103,6 +150,11 @@ define i16 @smul16rm(i16 noundef %a, ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: imulw (%rsi), %di, %ax ; CHECK-NEXT: retq +; +; NF-LABEL: smul16rm: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} imulw (%rsi), %di, %ax +; NF-NEXT: retq entry: %b = load i16, ptr %ptr %t = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a, i16 %b) @@ -115,6 +167,11 @@ define i32 @smul32rm(i32 noundef %a, ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: imull (%rsi), %edi, %eax ; CHECK-NEXT: retq +; +; NF-LABEL: smul32rm: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} imull (%rsi), %edi, %eax +; NF-NEXT: retq entry: %b = load i32, ptr %ptr %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %a, i32 %b) @@ -127,6 +184,11 @@ define i64 @smul64rm(i64 noundef %a, ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: imulq (%rsi), %rdi, %rax ; CHECK-NEXT: retq +; +; NF-LABEL: smul64rm: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} imulq (%rsi), %rdi, %rax +; NF-NEXT: retq entry: %b = load i64, ptr %ptr %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %a, i64 %b) diff --git a/llvm/test/CodeGen/X86/apx/inc.ll b/llvm/test/CodeGen/X86/apx/inc.ll index a9c6d740cf2cee..8d31badb997797 100644 --- a/llvm/test/CodeGen/X86/apx/inc.ll +++ b/llvm/test/CodeGen/X86/apx/inc.ll @@ -1,11 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs | FileCheck %s +; RUN: llc < %s 
-mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs | FileCheck --check-prefix=NF %s define i8 @inc8r(i8 noundef %a) { ; CHECK-LABEL: inc8r: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: incb %dil, %al ; CHECK-NEXT: retq +; +; NF-LABEL: inc8r: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} incb %dil, %al +; NF-NEXT: retq entry: %inc = add i8 %a, 1 ret i8 %inc @@ -17,6 +23,12 @@ define i16 @inc16r(i16 noundef %a) { ; CHECK-NEXT: incl %edi, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq +; +; NF-LABEL: inc16r: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} incl %edi, %eax +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq entry: %inc = add i16 %a, 1 ret i16 %inc @@ -27,6 +39,11 @@ define i32 @inc32r(i32 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: incl %edi, %eax ; CHECK-NEXT: retq +; +; NF-LABEL: inc32r: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} incl %edi, %eax +; NF-NEXT: retq entry: %inc = add i32 %a, 1 ret i32 %inc @@ -37,6 +54,11 @@ define i64 @inc64r(i64 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: incq %rdi, %rax ; CHECK-NEXT: retq +; +; NF-LABEL: inc64r: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} incq %rdi, %rax +; NF-NEXT: retq entry: %inc = add i64 %a, 1 ret i64 %inc @@ -47,6 +69,11 @@ define i8 @inc8m(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: incb (%rdi), %al ; CHECK-NEXT: retq +; +; NF-LABEL: inc8m: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} incb (%rdi), %al +; NF-NEXT: retq entry: %a = load i8, ptr %ptr %inc = add i8 %a, 1 @@ -60,6 +87,13 @@ define i16 @inc16m(ptr %ptr) { ; CHECK-NEXT: incl %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq +; +; NF-LABEL: inc16m: +; NF: # %bb.0: # %entry +; NF-NEXT: movzwl (%rdi), %eax +; NF-NEXT: incl %eax +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq entry: %a = load i16, ptr %ptr %inc = add i16 %a, 1 @@ -71,6 +105,11 @@ define i32 @inc32m(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: incl 
(%rdi), %eax ; CHECK-NEXT: retq +; +; NF-LABEL: inc32m: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} incl (%rdi), %eax +; NF-NEXT: retq entry: %a = load i32, ptr %ptr %inc = add i32 %a, 1 @@ -82,6 +121,11 @@ define i64 @inc64m(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: incq (%rdi), %rax ; CHECK-NEXT: retq +; +; NF-LABEL: inc64m: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} incq (%rdi), %rax +; NF-NEXT: retq entry: %a = load i64, ptr %ptr %inc = add i64 %a, 1 @@ -97,6 +141,15 @@ define i8 @uinc8r(i8 noundef %a) { ; CHECK-NEXT: cmovel %ecx, %eax ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq +; +; NF-LABEL: uinc8r: +; NF: # %bb.0: # %entry +; NF-NEXT: incb %dil, %al +; NF-NEXT: movzbl %al, %eax +; NF-NEXT: movl $255, %ecx +; NF-NEXT: cmovel %ecx, %eax +; NF-NEXT: # kill: def $al killed $al killed $eax +; NF-NEXT: retq entry: %inc = call i8 @llvm.uadd.sat.i8(i8 %a, i8 1) ret i8 %inc @@ -110,6 +163,14 @@ define i16 @uinc16r(i16 noundef %a) { ; CHECK-NEXT: cmovel %ecx, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq +; +; NF-LABEL: uinc16r: +; NF: # %bb.0: # %entry +; NF-NEXT: incw %di, %ax +; NF-NEXT: movl $65535, %ecx # imm = 0xFFFF +; NF-NEXT: cmovel %ecx, %eax +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq entry: %inc = call i16 @llvm.uadd.sat.i16(i16 %a, i16 1) ret i16 %inc @@ -122,6 +183,13 @@ define i32 @uinc32r(i32 noundef %a) { ; CHECK-NEXT: movl $-1, %ecx ; CHECK-NEXT: cmovel %ecx, %eax ; CHECK-NEXT: retq +; +; NF-LABEL: uinc32r: +; NF: # %bb.0: # %entry +; NF-NEXT: incl %edi, %eax +; NF-NEXT: movl $-1, %ecx +; NF-NEXT: cmovel %ecx, %eax +; NF-NEXT: retq entry: %inc = call i32 @llvm.uadd.sat.i32(i32 %a, i32 1) ret i32 %inc @@ -134,6 +202,13 @@ define i64 @uinc64r(i64 noundef %a) { ; CHECK-NEXT: movq $-1, %rcx ; CHECK-NEXT: cmoveq %rcx, %rax ; CHECK-NEXT: retq +; +; NF-LABEL: uinc64r: +; NF: # %bb.0: # %entry +; NF-NEXT: incq %rdi, %rax +; NF-NEXT: movq $-1, %rcx +; NF-NEXT: cmoveq 
%rcx, %rax +; NF-NEXT: retq entry: %inc = call i64 @llvm.uadd.sat.i64(i64 %a, i64 1) ret i64 %inc @@ -149,6 +224,11 @@ define void @inc8m_legacy(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: incb (%rdi) ; CHECK-NEXT: retq +; +; NF-LABEL: inc8m_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: incb (%rdi) +; NF-NEXT: retq entry: %a = load i8, ptr %ptr %inc = add i8 %a, 1 @@ -161,6 +241,11 @@ define void @inc16m_legacy(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: incw (%rdi) ; CHECK-NEXT: retq +; +; NF-LABEL: inc16m_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: incw (%rdi) +; NF-NEXT: retq entry: %a = load i16, ptr %ptr %inc = add i16 %a, 1 @@ -173,6 +258,11 @@ define void @inc32m_legacy(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: incl (%rdi) ; CHECK-NEXT: retq +; +; NF-LABEL: inc32m_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: incl (%rdi) +; NF-NEXT: retq entry: %a = load i32, ptr %ptr %inc = add i32 %a, 1 @@ -185,6 +275,11 @@ define void @inc64m_legacy(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: incq (%rdi) ; CHECK-NEXT: retq +; +; NF-LABEL: inc64m_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: incq (%rdi) +; NF-NEXT: retq entry: %a = load i64, ptr %ptr %inc = add i64 %a, 1 diff --git a/llvm/test/CodeGen/X86/apx/neg.ll b/llvm/test/CodeGen/X86/apx/neg.ll index c1c53fbdaebd82..5e033e33cb8b2a 100644 --- a/llvm/test/CodeGen/X86/apx/neg.ll +++ b/llvm/test/CodeGen/X86/apx/neg.ll @@ -1,11 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs | FileCheck --check-prefix=NF %s define i8 @neg8r(i8 noundef %a) { ; CHECK-LABEL: neg8r: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: negb %dil, %al ; CHECK-NEXT: retq +; +; NF-LABEL: neg8r: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} negb %dil, %al +; NF-NEXT: retq entry: %neg = sub i8 0, %a ret 
i8 %neg @@ -17,6 +23,12 @@ define i16 @neg16r(i16 noundef %a) { ; CHECK-NEXT: negl %edi, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq +; +; NF-LABEL: neg16r: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} negl %edi, %eax +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq entry: %neg = sub i16 0, %a ret i16 %neg @@ -27,6 +39,11 @@ define i32 @neg32r(i32 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: negl %edi, %eax ; CHECK-NEXT: retq +; +; NF-LABEL: neg32r: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} negl %edi, %eax +; NF-NEXT: retq entry: %neg = sub i32 0, %a ret i32 %neg @@ -37,6 +54,11 @@ define i64 @neg64r(i64 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: negq %rdi, %rax ; CHECK-NEXT: retq +; +; NF-LABEL: neg64r: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} negq %rdi, %rax +; NF-NEXT: retq entry: %neg = sub i64 0, %a ret i64 %neg @@ -47,6 +69,11 @@ define i8 @neg8m(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: negb (%rdi), %al ; CHECK-NEXT: retq +; +; NF-LABEL: neg8m: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} negb (%rdi), %al +; NF-NEXT: retq entry: %a = load i8, ptr %ptr %neg = sub i8 0, %a @@ -58,6 +85,11 @@ define i16 @neg16m(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: negw (%rdi), %ax ; CHECK-NEXT: retq +; +; NF-LABEL: neg16m: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} negw (%rdi), %ax +; NF-NEXT: retq entry: %a = load i16, ptr %ptr %neg = sub i16 0, %a @@ -69,6 +101,11 @@ define i32 @neg32m(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: negl (%rdi), %eax ; CHECK-NEXT: retq +; +; NF-LABEL: neg32m: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} negl (%rdi), %eax +; NF-NEXT: retq entry: %a = load i32, ptr %ptr %neg = sub i32 0, %a @@ -80,6 +117,11 @@ define i64 @neg64m(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: negq (%rdi), %rax ; CHECK-NEXT: retq +; +; NF-LABEL: neg64m: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} negq (%rdi), %rax +; NF-NEXT: retq entry: %a = load i64, ptr %ptr 
%neg = sub i64 0, %a @@ -91,6 +133,11 @@ define i8 @uneg8r(i8 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: negb %dil, %al ; CHECK-NEXT: retq +; +; NF-LABEL: uneg8r: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} negb %dil, %al +; NF-NEXT: retq entry: %t = call {i8, i1} @llvm.usub.with.overflow.i8(i8 0, i8 %a) %neg = extractvalue {i8, i1} %t, 0 @@ -103,6 +150,12 @@ define i16 @uneg16r(i16 noundef %a) { ; CHECK-NEXT: negl %edi, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq +; +; NF-LABEL: uneg16r: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} negl %edi, %eax +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq entry: %t = call {i16, i1} @llvm.usub.with.overflow.i16(i16 0, i16 %a) %neg = extractvalue {i16, i1} %t, 0 @@ -114,6 +167,11 @@ define i32 @uneg32r(i32 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: negl %edi, %eax ; CHECK-NEXT: retq +; +; NF-LABEL: uneg32r: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} negl %edi, %eax +; NF-NEXT: retq entry: %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 0, i32 %a) %neg = extractvalue {i32, i1} %t, 0 @@ -125,6 +183,11 @@ define i64 @uneg64r(i64 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: negq %rdi, %rax ; CHECK-NEXT: retq +; +; NF-LABEL: uneg64r: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} negq %rdi, %rax +; NF-NEXT: retq entry: %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 0, i64 %a) %neg = extractvalue {i64, i1} %t, 0 @@ -136,6 +199,11 @@ define i8 @uneg8m(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: negb (%rdi), %al ; CHECK-NEXT: retq +; +; NF-LABEL: uneg8m: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} negb (%rdi), %al +; NF-NEXT: retq entry: %a = load i8, ptr %ptr %t = call {i8, i1} @llvm.usub.with.overflow.i8(i8 0, i8 %a) @@ -148,6 +216,11 @@ define i16 @uneg16m(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: negw (%rdi), %ax ; CHECK-NEXT: retq +; +; NF-LABEL: uneg16m: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} negw (%rdi), %ax +; NF-NEXT: 
retq entry: %a = load i16, ptr %ptr %t = call {i16, i1} @llvm.usub.with.overflow.i16(i16 0, i16 %a) @@ -160,6 +233,11 @@ define i32 @uneg32m(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: negl (%rdi), %eax ; CHECK-NEXT: retq +; +; NF-LABEL: uneg32m: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} negl (%rdi), %eax +; NF-NEXT: retq entry: %a = load i32, ptr %ptr %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 0, i32 %a) @@ -172,6 +250,11 @@ define i64 @uneg64m(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: negq (%rdi), %rax ; CHECK-NEXT: retq +; +; NF-LABEL: uneg64m: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} negq (%rdi), %rax +; NF-NEXT: retq entry: %a = load i64, ptr %ptr %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 0, i64 %a) @@ -189,6 +272,11 @@ define void @neg8m_legacy(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: negb (%rdi) ; CHECK-NEXT: retq +; +; NF-LABEL: neg8m_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: negb (%rdi) +; NF-NEXT: retq entry: %a = load i8, ptr %ptr %neg = sub i8 0, %a @@ -201,6 +289,11 @@ define void @neg16m_legacy(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: negw (%rdi) ; CHECK-NEXT: retq +; +; NF-LABEL: neg16m_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: negw (%rdi) +; NF-NEXT: retq entry: %a = load i16, ptr %ptr %neg = sub i16 0, %a @@ -213,6 +306,11 @@ define void @neg32m_legacy(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: negl (%rdi) ; CHECK-NEXT: retq +; +; NF-LABEL: neg32m_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: negl (%rdi) +; NF-NEXT: retq entry: %a = load i32, ptr %ptr %neg = sub i32 0, %a @@ -225,6 +323,11 @@ define void @neg64m_legacy(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: negq (%rdi) ; CHECK-NEXT: retq +; +; NF-LABEL: neg64m_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: negq (%rdi) +; NF-NEXT: retq entry: %a = load i64, ptr %ptr %neg = sub i64 0, %a diff --git a/llvm/test/CodeGen/X86/apx/or.ll b/llvm/test/CodeGen/X86/apx/or.ll index 3d024e962400fa..d404279e14f7ab 100644 
--- a/llvm/test/CodeGen/X86/apx/or.ll +++ b/llvm/test/CodeGen/X86/apx/or.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s define i8 @or8rr(i8 noundef %a, i8 noundef %b) { ; CHECK-LABEL: or8rr: @@ -7,6 +8,12 @@ define i8 @or8rr(i8 noundef %a, i8 noundef %b) { ; CHECK-NEXT: orl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x09,0xf7] ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or8rr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} orl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x09,0xf7] +; NF-NEXT: # kill: def $al killed $al killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %or = or i8 %a, %b ret i8 %or @@ -18,6 +25,12 @@ define i16 @or16rr(i16 noundef %a, i16 noundef %b) { ; CHECK-NEXT: orl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x09,0xf7] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or16rr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} orl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x09,0xf7] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %or = or i16 %a, %b ret i16 %or @@ -28,6 +41,11 @@ define i32 @or32rr(i32 noundef %a, i32 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: orl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x09,0xf7] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or32rr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} orl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x09,0xf7] +; NF-NEXT: retq # encoding: [0xc3] entry: %or = or i32 %a, %b ret i32 %or @@ -38,6 +56,11 @@ define i64 
@or64rr(i64 noundef %a, i64 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: orq %rsi, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x09,0xf7] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or64rr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} orq %rsi, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x09,0xf7] +; NF-NEXT: retq # encoding: [0xc3] entry: %or = or i64 %a, %b ret i64 %or @@ -48,6 +71,11 @@ define i8 @or8rm(i8 noundef %a, ptr %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: orb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x0a,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or8rm: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} orb (%rsi), %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x0a,0x3e] +; NF-NEXT: retq # encoding: [0xc3] entry: %t = load i8, ptr %b %or = or i8 %a, %t @@ -59,6 +87,11 @@ define i16 @or16rm(i16 noundef %a, ptr %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: orw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x0b,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or16rm: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} orw (%rsi), %di, %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x0b,0x3e] +; NF-NEXT: retq # encoding: [0xc3] entry: %t = load i16, ptr %b %or = or i16 %a, %t @@ -70,6 +103,11 @@ define i32 @or32rm(i32 noundef %a, ptr %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: orl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x0b,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or32rm: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} orl (%rsi), %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x0b,0x3e] +; NF-NEXT: retq # encoding: [0xc3] entry: %t = load i32, ptr %b %or = or i32 %a, %t @@ -81,6 +119,11 @@ define i64 @or64rm(i64 noundef %a, ptr %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: orq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x0b,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or64rm: 
+; NF: # %bb.0: # %entry +; NF-NEXT: {nf} orq (%rsi), %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x0b,0x3e] +; NF-NEXT: retq # encoding: [0xc3] entry: %t = load i64, ptr %b %or = or i64 %a, %t @@ -93,6 +136,12 @@ define i16 @or16ri8(i16 noundef %a) { ; CHECK-NEXT: orl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xcf,0x7b] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or16ri8: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} orl $123, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xcf,0x7b] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %or = or i16 %a, 123 ret i16 %or @@ -103,6 +152,11 @@ define i32 @or32ri8(i32 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: orl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xcf,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or32ri8: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} orl $123, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xcf,0x7b] +; NF-NEXT: retq # encoding: [0xc3] entry: %or = or i32 %a, 123 ret i32 %or @@ -113,6 +167,11 @@ define i64 @or64ri8(i64 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: orq $123, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xcf,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or64ri8: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} orq $123, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x83,0xcf,0x7b] +; NF-NEXT: retq # encoding: [0xc3] entry: %or = or i64 %a, 123 ret i64 %or @@ -123,6 +182,11 @@ define i8 @or8ri(i8 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: orb $123, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0xcf,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or8ri: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} orb $123, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0xcf,0x7b] +; 
NF-NEXT: retq # encoding: [0xc3] entry: %or = or i8 %a, 123 ret i8 %or @@ -135,6 +199,13 @@ define i16 @or16ri(i16 noundef %a) { ; CHECK-NEXT: # imm = 0x4D2 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or16ri: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} orl $1234, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xcf,0xd2,0x04,0x00,0x00] +; NF-NEXT: # imm = 0x4D2 +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %or = or i16 %a, 1234 ret i16 %or @@ -146,6 +217,12 @@ define i32 @or32ri(i32 noundef %a) { ; CHECK-NEXT: orl $123456, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xcf,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or32ri: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} orl $123456, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xcf,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: retq # encoding: [0xc3] entry: %or = or i32 %a, 123456 ret i32 %or @@ -157,6 +234,12 @@ define i64 @or64ri(i64 noundef %a) { ; CHECK-NEXT: orq $123456, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0xcf,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or64ri: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} orq $123456, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0xcf,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: retq # encoding: [0xc3] entry: %or = or i64 %a, 123456 ret i64 %or @@ -167,6 +250,11 @@ define i8 @or8mr(ptr %a, i8 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: orb %sil, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x08,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or8mr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} orb %sil, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x08,0x37] +; NF-NEXT: retq # encoding: 
[0xc3] entry: %t= load i8, ptr %a %or = or i8 %t, %b @@ -178,6 +266,11 @@ define i16 @or16mr(ptr %a, i16 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: orw %si, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x09,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or16mr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} orw %si, (%rdi), %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x09,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i16, ptr %a %or = or i16 %t, %b @@ -189,6 +282,11 @@ define i32 @or32mr(ptr %a, i32 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: orl %esi, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x09,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or32mr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} orl %esi, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x09,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i32, ptr %a %or = or i32 %t, %b @@ -200,6 +298,11 @@ define i64 @or64mr(ptr %a, i64 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: orq %rsi, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x09,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or64mr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} orq %rsi, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x09,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i64, ptr %a %or = or i64 %t, %b @@ -213,6 +316,13 @@ define i16 @or16mi8(ptr %a) { ; CHECK-NEXT: orl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xc8,0x7b] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or16mi8: +; NF: # %bb.0: # %entry +; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07] +; NF-NEXT: orl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xc8,0x7b] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i16, ptr %a %or = or i16 %t, 123 @@ -224,6 +334,11 @@ define i32 
@or32mi8(ptr %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: orl $123, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x0f,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or32mi8: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} orl $123, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0x0f,0x7b] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i32, ptr %a %or = or i32 %t, 123 @@ -235,6 +350,11 @@ define i64 @or64mi8(ptr %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: orq $123, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0x0f,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or64mi8: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} orq $123, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x83,0x0f,0x7b] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i64, ptr %a %or = or i64 %t, 123 @@ -246,6 +366,11 @@ define i8 @or8mi(ptr %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: orb $123, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0x0f,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or8mi: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} orb $123, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0x0f,0x7b] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i8, ptr %a %or = or i8 %t, 123 @@ -260,6 +385,14 @@ define i16 @or16mi(ptr %a) { ; CHECK-NEXT: # imm = 0x4D2 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or16mi: +; NF: # %bb.0: # %entry +; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07] +; NF-NEXT: orl $1234, %eax # EVEX TO LEGACY Compression encoding: [0x0d,0xd2,0x04,0x00,0x00] +; NF-NEXT: # imm = 0x4D2 +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i16, ptr %a %or = or i16 %t, 1234 @@ -272,6 +405,12 @@ define i32 @or32mi(ptr %a) { ; CHECK-NEXT: orl $123456, (%rdi), %eax # encoding: 
[0x62,0xf4,0x7c,0x18,0x81,0x0f,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or32mi: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} orl $123456, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0x0f,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i32, ptr %a %or = or i32 %t, 123456 @@ -284,6 +423,12 @@ define i64 @or64mi(ptr %a) { ; CHECK-NEXT: orq $123456, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x0f,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or64mi: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} orq $123456, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0x0f,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i64, ptr %a %or = or i64 %t, 123456 @@ -301,6 +446,15 @@ define i1 @orflag8rr(i8 %a, i8 %b) { ; CHECK-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: orflag8rr: +; NF: # %bb.0: +; NF-NEXT: notb %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xf6,0xd6] +; NF-NEXT: orb %al, %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x08,0xc7] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %xor = xor i8 %b, -1 %v0 = or i8 %a, %xor ; 0xff << 50 %v1 = icmp eq i8 %v0, 0 @@ -317,6 +471,15 @@ define i1 @orflag16rr(i16 %a, i16 %b) { ; CHECK-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: orflag16rr: +; NF: # %bb.0: +; NF-NEXT: notl %esi, %eax # encoding: 
[0x62,0xf4,0x7c,0x18,0xf7,0xd6] +; NF-NEXT: orw %ax, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x09,0xc7] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %xor = xor i16 %b, -1 %v0 = or i16 %a, %xor ; 0xff << 50 %v1 = icmp eq i16 %v0, 0 @@ -332,6 +495,14 @@ define i1 @orflag32rr(i32 %a, i32 %b) { ; CHECK-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: orflag32rr: +; NF: # %bb.0: +; NF-NEXT: orl %esi, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x09,0xf7] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %v0 = or i32 %a, %b ; 0xff << 50 %v1 = icmp eq i32 %v0, 0 store i32 %v0, ptr @d64 @@ -346,6 +517,14 @@ define i1 @orflag64rr(i64 %a, i64 %b) { ; CHECK-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: orflag64rr: +; NF: # %bb.0: +; NF-NEXT: orq %rsi, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x09,0xf7] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %v0 = or i64 %a, %b ; 0xff << 50 %v1 = icmp eq i64 %v0, 0 store i64 %v0, ptr @d64 @@ -361,6 +540,15 @@ define i1 @orflag8rm(ptr %ptr, i8 %b) { ; CHECK-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # 
encoding: [0xc3] +; +; NF-LABEL: orflag8rm: +; NF: # %bb.0: +; NF-NEXT: notb %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xf6,0xd6] +; NF-NEXT: orb (%rdi), %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x0a,0x07] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %a = load i8, ptr %ptr %xor = xor i8 %b, -1 %v0 = or i8 %a, %xor ; 0xff << 50 @@ -378,6 +566,15 @@ define i1 @orflag16rm(ptr %ptr, i16 %b) { ; CHECK-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: orflag16rm: +; NF: # %bb.0: +; NF-NEXT: notl %esi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xf7,0xd6] +; NF-NEXT: orw (%rdi), %ax, %cx # encoding: [0x62,0xf4,0x75,0x18,0x0b,0x07] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %a = load i16, ptr %ptr %xor = xor i16 %b, -1 %v0 = or i16 %a, %xor ; 0xff << 50 @@ -394,6 +591,14 @@ define i1 @orflag32rm(ptr %ptr, i32 %b) { ; CHECK-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: orflag32rm: +; NF: # %bb.0: +; NF-NEXT: orl (%rdi), %esi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x0b,0x37] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %a = load i32, ptr %ptr %v0 = or i32 %a, %b ; 0xff << 50 %v1 = icmp eq i32 %v0, 0 @@ -409,6 +614,14 @@ define i1 @orflag64rm(ptr %ptr, i64 %b) { ; 
CHECK-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: orflag64rm: +; NF: # %bb.0: +; NF-NEXT: orq (%rdi), %rsi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x0b,0x37] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %a = load i64, ptr %ptr %v0 = or i64 %a, %b ; 0xff << 50 %v1 = icmp eq i64 %v0, 0 @@ -424,6 +637,14 @@ define i1 @orflag8ri(i8 %a) { ; CHECK-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: orflag8ri: +; NF: # %bb.0: +; NF-NEXT: orb $-124, %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xcf,0x84] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %xor = xor i8 123, -1 %v0 = or i8 %a, %xor ; 0xff << 50 %v1 = icmp eq i8 %v0, 0 @@ -440,6 +661,15 @@ define i1 @orflag16ri(i16 %a) { ; CHECK-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: orflag16ri: +; NF: # %bb.0: +; NF-NEXT: orw $-1235, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x81,0xcf,0x2d,0xfb] +; NF-NEXT: # imm = 0xFB2D +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %xor = xor i16 1234, -1 %v0 = or i16 %a, %xor ; 0xff << 50 %v1 = icmp eq i16 %v0, 0 @@ -456,6 
+686,15 @@ define i1 @orflag32ri(i32 %a) { ; CHECK-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: orflag32ri: +; NF: # %bb.0: +; NF-NEXT: orl $123456, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x81,0xcf,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %v0 = or i32 %a, 123456 ; 0xff << 50 %v1 = icmp eq i32 %v0, 0 store i32 %v0, ptr @d64 @@ -471,6 +710,15 @@ define i1 @orflag64ri(i64 %a) { ; CHECK-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: orflag64ri: +; NF: # %bb.0: +; NF-NEXT: orq $123456, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x81,0xcf,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %v0 = or i64 %a, 123456 ; 0xff << 50 %v1 = icmp eq i64 %v0, 0 store i64 %v0, ptr @d64 @@ -485,6 +733,14 @@ define i1 @orflag16ri8(i16 %a) { ; CHECK-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: orflag16ri8: +; NF: # %bb.0: +; NF-NEXT: orw $-124, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xcf,0x84] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte +; 
NF-NEXT: retq # encoding: [0xc3] %xor = xor i16 123, -1 %v0 = or i16 %a, %xor ; 0xff << 50 %v1 = icmp eq i16 %v0, 0 @@ -500,6 +756,14 @@ define i1 @orflag32ri8(i32 %a) { ; CHECK-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: orflag32ri8: +; NF: # %bb.0: +; NF-NEXT: orl $123, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x83,0xcf,0x7b] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %v0 = or i32 %a, 123 ; 0xff << 50 %v1 = icmp eq i32 %v0, 0 store i32 %v0, ptr @d64 @@ -514,6 +778,14 @@ define i1 @orflag64ri8(i64 %a) { ; CHECK-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: orflag64ri8: +; NF: # %bb.0: +; NF-NEXT: orq $123, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x83,0xcf,0x7b] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %v0 = or i64 %a, 123 ; 0xff << 50 %v1 = icmp eq i64 %v0, 0 store i64 %v0, ptr @d64 @@ -525,6 +797,11 @@ define void @or8mr_legacy(ptr %a, i8 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: orb %sil, (%rdi) # encoding: [0x40,0x08,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or8mr_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: orb %sil, (%rdi) # encoding: [0x40,0x08,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i8, ptr %a %or = or i8 %t, %b @@ -537,6 +814,11 @@ define void @or16mr_legacy(ptr %a, i16 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: orw %si, (%rdi) # 
encoding: [0x66,0x09,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or16mr_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: orw %si, (%rdi) # encoding: [0x66,0x09,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i16, ptr %a %or = or i16 %t, %b @@ -549,6 +831,11 @@ define void @or32mr_legacy(ptr %a, i32 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: orl %esi, (%rdi) # encoding: [0x09,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or32mr_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: orl %esi, (%rdi) # encoding: [0x09,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i32, ptr %a %or = or i32 %t, %b @@ -561,6 +848,11 @@ define void @or64mr_legacy(ptr %a, i64 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: orq %rsi, (%rdi) # encoding: [0x48,0x09,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or64mr_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: orq %rsi, (%rdi) # encoding: [0x48,0x09,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i64, ptr %a %or = or i64 %t, %b @@ -573,6 +865,11 @@ define void @or8mi_legacy(ptr %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: orb $123, (%rdi) # encoding: [0x80,0x0f,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or8mi_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: orb $123, (%rdi) # encoding: [0x80,0x0f,0x7b] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i8, ptr %a %or = or i8 %t, 123 @@ -586,6 +883,12 @@ define void @or16mi_legacy(ptr %a) { ; CHECK-NEXT: orw $1234, (%rdi) # encoding: [0x66,0x81,0x0f,0xd2,0x04] ; CHECK-NEXT: # imm = 0x4D2 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or16mi_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: orw $1234, (%rdi) # encoding: [0x66,0x81,0x0f,0xd2,0x04] +; NF-NEXT: # imm = 0x4D2 +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i16, ptr %a %or = or i16 %t, 1234 @@ -599,6 +902,12 @@ define void @or32mi_legacy(ptr %a) { ; CHECK-NEXT: orl $123456, (%rdi) # encoding: [0x81,0x0f,0x40,0xe2,0x01,0x00] ; 
CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or32mi_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: orl $123456, (%rdi) # encoding: [0x81,0x0f,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i32, ptr %a %or = or i32 %t, 123456 @@ -612,6 +921,12 @@ define void @or64mi_legacy(ptr %a) { ; CHECK-NEXT: orq $123456, (%rdi) # encoding: [0x48,0x81,0x0f,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: or64mi_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: orq $123456, (%rdi) # encoding: [0x48,0x81,0x0f,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i64, ptr %a %or = or i64 %t, 123456 diff --git a/llvm/test/CodeGen/X86/apx/shl.ll b/llvm/test/CodeGen/X86/apx/shl.ll index 869caf932ff920..35b6cb27254b2e 100644 --- a/llvm/test/CodeGen/X86/apx/shl.ll +++ b/llvm/test/CodeGen/X86/apx/shl.ll @@ -1,11 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s define i8 @shl8ri(i8 noundef %a) { ; CHECK-LABEL: shl8ri: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shlb $4, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0xe7,0x04] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl8ri: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} shlb $4, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc0,0xe7,0x04] +; NF-NEXT: retq # encoding: [0xc3] entry: %shl = shl i8 %a, 4 ret i8 %shl @@ -17,6 +23,12 @@ define i16 @shl16ri(i16 noundef %a) { ; CHECK-NEXT: shll $4, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0xe7,0x04] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: 
shl16ri: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} shll $4, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc1,0xe7,0x04] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %shl = shl i16 %a, 4 ret i16 %shl @@ -27,6 +39,11 @@ define i32 @shl32ri(i32 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shll $4, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0xe7,0x04] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl32ri: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} shll $4, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc1,0xe7,0x04] +; NF-NEXT: retq # encoding: [0xc3] entry: %shl = shl i32 %a, 4 ret i32 %shl @@ -37,6 +54,11 @@ define i64 @shl64ri(i64 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shlq $4, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0xe7,0x04] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl64ri: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} shlq $4, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xc1,0xe7,0x04] +; NF-NEXT: retq # encoding: [0xc3] entry: %shl = shl i64 %a, 4 ret i64 %shl @@ -48,6 +70,12 @@ define i8 @shl8m1(ptr %ptr) { ; CHECK-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07] ; CHECK-NEXT: addb %al, %al # EVEX TO LEGACY Compression encoding: [0x00,0xc0] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl8m1: +; NF: # %bb.0: # %entry +; NF-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07] +; NF-NEXT: addb %al, %al # EVEX TO LEGACY Compression encoding: [0x00,0xc0] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i8, ptr %ptr %shl = shl i8 %a, 1 @@ -61,6 +89,13 @@ define i16 @shl16m1(ptr %ptr) { ; CHECK-NEXT: addl %eax, %eax # EVEX TO LEGACY Compression encoding: [0x01,0xc0] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl16m1: +; NF: # %bb.0: # %entry +; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07] +; 
NF-NEXT: addl %eax, %eax # EVEX TO LEGACY Compression encoding: [0x01,0xc0] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i16, ptr %ptr %shl = shl i16 %a, 1 @@ -73,6 +108,12 @@ define i32 @shl32m1(ptr %ptr) { ; CHECK-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07] ; CHECK-NEXT: addl %eax, %eax # EVEX TO LEGACY Compression encoding: [0x01,0xc0] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl32m1: +; NF: # %bb.0: # %entry +; NF-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07] +; NF-NEXT: addl %eax, %eax # EVEX TO LEGACY Compression encoding: [0x01,0xc0] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i32, ptr %ptr %shl = shl i32 %a, 1 @@ -85,6 +126,12 @@ define i64 @shl64m1(ptr %ptr) { ; CHECK-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07] ; CHECK-NEXT: addq %rax, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xc0] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl64m1: +; NF: # %bb.0: # %entry +; NF-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07] +; NF-NEXT: addq %rax, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xc0] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i64, ptr %ptr %shl = shl i64 %a, 1 @@ -98,6 +145,13 @@ define i8 @shl8mcl(ptr %ptr, i8 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shlb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x27] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl8mcl: +; NF: # %bb.0: # %entry +; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $ecx +; NF-NEXT: {nf} shlb %cl, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd2,0x27] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i8, ptr %ptr %shl = shl i8 %a, %cl @@ -111,6 +165,13 @@ define i8 @shl8mcl_mask(ptr %ptr, i8 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shlb %cl, (%rdi), %al # encoding: 
[0x62,0xf4,0x7c,0x18,0xd2,0x27] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl8mcl_mask: +; NF: # %bb.0: # %entry +; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $ecx +; NF-NEXT: {nf} shlb %cl, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd2,0x27] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i8, ptr %ptr %shamt = and i8 %cl, 31 @@ -127,6 +188,15 @@ define i16 @shl16mcl(ptr %ptr, i16 %cl) { ; CHECK-NEXT: shll %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe0] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl16mcl: +; NF: # %bb.0: # %entry +; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07] +; NF-NEXT: # kill: def $cl killed $cl killed $ecx +; NF-NEXT: shll %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe0] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i16, ptr %ptr %shl = shl i16 %a, %cl @@ -142,6 +212,15 @@ define i16 @shl16mcl_mask(ptr %ptr, i16 %cl) { ; CHECK-NEXT: shll %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe0] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl16mcl_mask: +; NF: # %bb.0: # %entry +; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07] +; NF-NEXT: # kill: def $cl killed $cl killed $ecx +; NF-NEXT: shll %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe0] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i16, ptr %ptr %shamt = and i16 %cl, 31 @@ -156,6 +235,13 @@ define i32 @shl32mcl(ptr %ptr, i32 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shll %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x27] ; CHECK-NEXT: retq # 
encoding: [0xc3] +; +; NF-LABEL: shl32mcl: +; NF: # %bb.0: # %entry +; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $ecx +; NF-NEXT: {nf} shll %cl, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0x27] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i32, ptr %ptr %shl = shl i32 %a, %cl @@ -169,6 +255,13 @@ define i32 @shl32mcl_mask(ptr %ptr, i32 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shll %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x27] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl32mcl_mask: +; NF: # %bb.0: # %entry +; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $ecx +; NF-NEXT: {nf} shll %cl, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0x27] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i32, ptr %ptr %shamt = and i32 %cl, 31 @@ -183,6 +276,13 @@ define i64 @shl64mcl(ptr %ptr, i64 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-NEXT: shlq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x27] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl64mcl: +; NF: # %bb.0: # %entry +; NF-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $rcx +; NF-NEXT: {nf} shlq %cl, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd3,0x27] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i64, ptr %ptr %shl = shl i64 %a, %cl @@ -196,6 +296,13 @@ define i64 @shl64mcl_mask(ptr %ptr, i64 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-NEXT: shlq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x27] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl64mcl_mask: +; NF: # %bb.0: # %entry +; NF-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $rcx +; NF-NEXT: {nf} shlq %cl, 
(%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd3,0x27] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i64, ptr %ptr %shamt = and i64 %cl, 63 @@ -208,6 +315,11 @@ define i8 @shl8mi(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shlb $4, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0x27,0x04] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl8mi: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} shlb $4, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc0,0x27,0x04] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i8, ptr %ptr %shl = shl i8 %a, 4 @@ -221,6 +333,13 @@ define i16 @shl16mi(ptr %ptr) { ; CHECK-NEXT: shll $4, %eax # EVEX TO LEGACY Compression encoding: [0xc1,0xe0,0x04] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl16mi: +; NF: # %bb.0: # %entry +; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07] +; NF-NEXT: shll $4, %eax # EVEX TO LEGACY Compression encoding: [0xc1,0xe0,0x04] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i16, ptr %ptr %shl = shl i16 %a, 4 @@ -232,6 +351,11 @@ define i32 @shl32mi(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shll $4, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0x27,0x04] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl32mi: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} shll $4, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc1,0x27,0x04] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i32, ptr %ptr %shl = shl i32 %a, 4 @@ -243,6 +367,11 @@ define i64 @shl64mi(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shlq $4, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0x27,0x04] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl64mi: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} shlq $4, (%rdi), %rax # EVEX TO EVEX Compression encoding: 
[0x62,0xf4,0xfc,0x1c,0xc1,0x27,0x04] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i64, ptr %ptr %shl = shl i64 %a, 4 @@ -254,6 +383,11 @@ define i8 @shl8r1(i8 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addb %dil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x00,0xff] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl8r1: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addb %dil, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x00,0xff] +; NF-NEXT: retq # encoding: [0xc3] entry: %shl = shl i8 %a, 1 ret i8 %shl @@ -265,6 +399,12 @@ define i16 @shl16r1(i16 noundef %a) { ; CHECK-NEXT: addl %edi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xff] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl16r1: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addl %edi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x01,0xff] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %shl = shl i16 %a, 1 ret i16 %shl @@ -275,6 +415,11 @@ define i32 @shl32r1(i32 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addl %edi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xff] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl32r1: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addl %edi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x01,0xff] +; NF-NEXT: retq # encoding: [0xc3] entry: %shl = shl i32 %a, 1 ret i32 %shl @@ -285,6 +430,11 @@ define i64 @shl64r1(i64 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addq %rdi, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x01,0xff] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl64r1: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addq %rdi, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x01,0xff] +; NF-NEXT: retq # encoding: [0xc3] entry: %shl = shl i64 %a, 1 ret i64 %shl @@ -297,6 +447,13 @@ define i8 @shl8rcl(i8 
noundef %a, i8 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shlb %cl, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0xe7] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl8rcl: +; NF: # %bb.0: # %entry +; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $ecx +; NF-NEXT: {nf} shlb %cl, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd2,0xe7] +; NF-NEXT: retq # encoding: [0xc3] entry: %shl = shl i8 %a, %cl ret i8 %shl @@ -309,6 +466,13 @@ define i8 @shl8rcl_mask(i8 noundef %a, i8 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shlb %cl, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0xe7] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl8rcl_mask: +; NF: # %bb.0: # %entry +; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $ecx +; NF-NEXT: {nf} shlb %cl, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd2,0xe7] +; NF-NEXT: retq # encoding: [0xc3] entry: %shamt = and i8 %cl, 31 %shl = shl i8 %a, %shamt @@ -323,6 +487,14 @@ define i16 @shl16rcl(i16 noundef %a, i16 %cl) { ; CHECK-NEXT: shll %cl, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0xe7] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl16rcl: +; NF: # %bb.0: # %entry +; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $ecx +; NF-NEXT: {nf} shll %cl, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0xe7] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %shl = shl i16 %a, %cl ret i16 %shl @@ -336,6 +508,14 @@ define i16 @shl16rcl_mask(i16 noundef %a, i16 %cl) { ; CHECK-NEXT: shll %cl, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0xe7] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; 
+; NF-LABEL: shl16rcl_mask: +; NF: # %bb.0: # %entry +; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $ecx +; NF-NEXT: {nf} shll %cl, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0xe7] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %shamt = and i16 %cl, 31 %shl = shl i16 %a, %shamt @@ -349,6 +529,13 @@ define i32 @shl32rcl(i32 noundef %a, i32 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shll %cl, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0xe7] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl32rcl: +; NF: # %bb.0: # %entry +; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $ecx +; NF-NEXT: {nf} shll %cl, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0xe7] +; NF-NEXT: retq # encoding: [0xc3] entry: %shl = shl i32 %a, %cl ret i32 %shl @@ -361,6 +548,13 @@ define i32 @shl32rcl_mask(i32 noundef %a, i32 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shll %cl, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0xe7] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl32rcl_mask: +; NF: # %bb.0: # %entry +; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $ecx +; NF-NEXT: {nf} shll %cl, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0xe7] +; NF-NEXT: retq # encoding: [0xc3] entry: %shamt = and i32 %cl, 31 %shl = shl i32 %a, %shamt @@ -374,6 +568,13 @@ define i64 @shl64rcl(i64 noundef %a, i64 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-NEXT: shlq %cl, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0xe7] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl64rcl: +; NF: # %bb.0: # %entry +; NF-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $rcx +; 
NF-NEXT: {nf} shlq %cl, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd3,0xe7] +; NF-NEXT: retq # encoding: [0xc3] entry: %shl = shl i64 %a, %cl ret i64 %shl @@ -386,6 +587,13 @@ define i64 @shl64rcl_mask(i64 noundef %a, i64 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-NEXT: shlq %cl, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0xe7] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl64rcl_mask: +; NF: # %bb.0: # %entry +; NF-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $rcx +; NF-NEXT: {nf} shlq %cl, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd3,0xe7] +; NF-NEXT: retq # encoding: [0xc3] entry: %shamt = and i64 %cl, 63 %shl = shl i64 %a, %shamt @@ -397,6 +605,11 @@ define void @shl8m1_legacy(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shlb (%rdi) # encoding: [0xd0,0x27] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl8m1_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: shlb (%rdi) # encoding: [0xd0,0x27] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i8, ptr %ptr %shl = shl i8 %a, 1 @@ -409,6 +622,11 @@ define void @shl16m1_legacy(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shlw (%rdi) # encoding: [0x66,0xd1,0x27] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl16m1_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: shlw (%rdi) # encoding: [0x66,0xd1,0x27] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i16, ptr %ptr %shl = shl i16 %a, 1 @@ -421,6 +639,11 @@ define void @shl32m1_legacy(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shll (%rdi) # encoding: [0xd1,0x27] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl32m1_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: shll (%rdi) # encoding: [0xd1,0x27] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i32, ptr %ptr %shl = shl i32 %a, 1 @@ -433,6 +656,11 @@ define void @shl64m1_legacy(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: shlq (%rdi) # encoding: [0x48,0xd1,0x27] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl64m1_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: shlq (%rdi) # encoding: [0x48,0xd1,0x27] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i64, ptr %ptr %shl = shl i64 %a, 1 @@ -445,6 +673,11 @@ define void @shl8mi_legacy(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shlb $4, (%rdi) # encoding: [0xc0,0x27,0x04] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl8mi_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: shlb $4, (%rdi) # encoding: [0xc0,0x27,0x04] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i8, ptr %ptr %shl = shl i8 %a, 4 @@ -457,6 +690,11 @@ define void @shl16mi_legacy(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shlw $4, (%rdi) # encoding: [0x66,0xc1,0x27,0x04] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl16mi_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: shlw $4, (%rdi) # encoding: [0x66,0xc1,0x27,0x04] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i16, ptr %ptr %shl = shl i16 %a, 4 @@ -469,6 +707,11 @@ define void @shl32mi_legacy(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shll $4, (%rdi) # encoding: [0xc1,0x27,0x04] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl32mi_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: shll $4, (%rdi) # encoding: [0xc1,0x27,0x04] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i32, ptr %ptr %shl = shl i32 %a, 4 @@ -481,6 +724,11 @@ define void @shl64mi_legacy(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shlq $4, (%rdi) # encoding: [0x48,0xc1,0x27,0x04] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl64mi_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: shlq $4, (%rdi) # encoding: [0x48,0xc1,0x27,0x04] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i64, ptr %ptr %shl = shl i64 %a, 4 @@ -495,6 +743,13 @@ define void @shl8mcl_legacy(ptr %ptr, i8 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shlb %cl, 
(%rdi) # encoding: [0xd2,0x27] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl8mcl_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $ecx +; NF-NEXT: shlb %cl, (%rdi) # encoding: [0xd2,0x27] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i8, ptr %ptr %shl = shl i8 %a, %cl @@ -509,6 +764,13 @@ define void @shl16mcl_legacy(ptr %ptr, i16 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shlw %cl, (%rdi) # encoding: [0x66,0xd3,0x27] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl16mcl_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $ecx +; NF-NEXT: shlw %cl, (%rdi) # encoding: [0x66,0xd3,0x27] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i16, ptr %ptr %shl = shl i16 %a, %cl @@ -523,6 +785,13 @@ define void @shl32mcl_legacy(ptr %ptr, i32 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shll %cl, (%rdi) # encoding: [0xd3,0x27] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl32mcl_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $ecx +; NF-NEXT: shll %cl, (%rdi) # encoding: [0xd3,0x27] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i32, ptr %ptr %shl = shl i32 %a, %cl @@ -537,6 +806,13 @@ define void @shl64mcl_legacy(ptr %ptr, i64 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-NEXT: shlq %cl, (%rdi) # encoding: [0x48,0xd3,0x27] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shl64mcl_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $rcx +; NF-NEXT: shlq %cl, (%rdi) # encoding: [0x48,0xd3,0x27] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i64, ptr %ptr %shl = shl i64 %a, %cl diff --git 
a/llvm/test/CodeGen/X86/apx/shr.ll b/llvm/test/CodeGen/X86/apx/shr.ll index a7e02d8586f49d..b5b91b02fedffb 100644 --- a/llvm/test/CodeGen/X86/apx/shr.ll +++ b/llvm/test/CodeGen/X86/apx/shr.ll @@ -1,11 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s define i8 @shr8m1(ptr %ptr) { ; CHECK-LABEL: shr8m1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shrb (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd0,0x2f] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr8m1: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} shrb (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd0,0x2f] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i8, ptr %ptr %shr = lshr i8 %a, 1 @@ -19,6 +25,13 @@ define i16 @shr16m1(ptr %ptr) { ; CHECK-NEXT: shrl %eax # EVEX TO LEGACY Compression encoding: [0xd1,0xe8] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr16m1: +; NF: # %bb.0: # %entry +; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07] +; NF-NEXT: shrl %eax # EVEX TO LEGACY Compression encoding: [0xd1,0xe8] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i16, ptr %ptr %shr = lshr i16 %a, 1 @@ -30,6 +43,11 @@ define i32 @shr32m1(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shrl (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd1,0x2f] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr32m1: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} shrl (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd1,0x2f] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i32, ptr %ptr %shr = lshr i32 %a, 1 @@ -41,6 +59,11 @@ define i64 @shr64m1(ptr %ptr) { ; CHECK: # 
%bb.0: # %entry ; CHECK-NEXT: shrq (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd1,0x2f] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr64m1: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} shrq (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd1,0x2f] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i64, ptr %ptr %shr = lshr i64 %a, 1 @@ -54,6 +77,13 @@ define i8 @shr8mcl(ptr %ptr, i8 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shrb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x2f] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr8mcl: +; NF: # %bb.0: # %entry +; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $ecx +; NF-NEXT: {nf} shrb %cl, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd2,0x2f] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i8, ptr %ptr %shr = lshr i8 %a, %cl @@ -67,6 +97,13 @@ define i8 @shr8mcl_mask(ptr %ptr, i8 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shrb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x2f] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr8mcl_mask: +; NF: # %bb.0: # %entry +; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $ecx +; NF-NEXT: {nf} shrb %cl, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd2,0x2f] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i8, ptr %ptr %shamt = and i8 %cl, 31 @@ -83,6 +120,15 @@ define i16 @shr16mcl(ptr %ptr, i16 %cl) { ; CHECK-NEXT: shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr16mcl: +; NF: # %bb.0: # %entry +; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07] +; NF-NEXT: # kill: def $cl killed $cl killed $ecx +; 
NF-NEXT: shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i16, ptr %ptr %shr = lshr i16 %a, %cl @@ -98,6 +144,15 @@ define i16 @shr16mcl_mask(ptr %ptr, i16 %cl) { ; CHECK-NEXT: shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr16mcl_mask: +; NF: # %bb.0: # %entry +; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07] +; NF-NEXT: # kill: def $cl killed $cl killed $ecx +; NF-NEXT: shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i16, ptr %ptr %shamt = and i16 %cl, 31 @@ -112,6 +167,13 @@ define i32 @shr32mcl(ptr %ptr, i32 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shrl %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x2f] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr32mcl: +; NF: # %bb.0: # %entry +; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $ecx +; NF-NEXT: {nf} shrl %cl, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0x2f] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i32, ptr %ptr %shr = lshr i32 %a, %cl @@ -125,6 +187,13 @@ define i32 @shr32mcl_mask(ptr %ptr, i32 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shrl %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x2f] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr32mcl_mask: +; NF: # %bb.0: # %entry +; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $ecx +; NF-NEXT: {nf} shrl %cl, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0x2f] +; 
NF-NEXT: retq # encoding: [0xc3] entry: %a = load i32, ptr %ptr %shamt = and i32 %cl, 31 @@ -139,6 +208,13 @@ define i64 @shr64mcl(ptr %ptr, i64 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-NEXT: shrq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x2f] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr64mcl: +; NF: # %bb.0: # %entry +; NF-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $rcx +; NF-NEXT: {nf} shrq %cl, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd3,0x2f] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i64, ptr %ptr %shr = lshr i64 %a, %cl @@ -152,6 +228,13 @@ define i64 @shr64mcl_mask(ptr %ptr, i64 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-NEXT: shrq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x2f] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr64mcl_mask: +; NF: # %bb.0: # %entry +; NF-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $rcx +; NF-NEXT: {nf} shrq %cl, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd3,0x2f] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i64, ptr %ptr %shamt = and i64 %cl, 63 @@ -164,6 +247,11 @@ define i8 @shr8mi(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shrb $4, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0x2f,0x04] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr8mi: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} shrb $4, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc0,0x2f,0x04] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i8, ptr %ptr %shr = lshr i8 %a, 4 @@ -177,6 +265,13 @@ define i16 @shr16mi(ptr %ptr) { ; CHECK-NEXT: shrl $4, %eax # EVEX TO LEGACY Compression encoding: [0xc1,0xe8,0x04] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr16mi: +; NF: # 
%bb.0: # %entry +; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07] +; NF-NEXT: shrl $4, %eax # EVEX TO LEGACY Compression encoding: [0xc1,0xe8,0x04] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i16, ptr %ptr %shr = lshr i16 %a, 4 @@ -188,6 +283,11 @@ define i32 @shr32mi(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shrl $4, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0x2f,0x04] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr32mi: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} shrl $4, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc1,0x2f,0x04] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i32, ptr %ptr %shr = lshr i32 %a, 4 @@ -199,6 +299,11 @@ define i64 @shr64mi(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shrq $4, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0x2f,0x04] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr64mi: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} shrq $4, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xc1,0x2f,0x04] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i64, ptr %ptr %shr = lshr i64 %a, 4 @@ -210,6 +315,11 @@ define i8 @shr8r1(i8 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shrb %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xd0,0xef] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr8r1: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} shrb %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd0,0xef] +; NF-NEXT: retq # encoding: [0xc3] entry: %shr = lshr i8 %a, 1 ret i8 %shr @@ -222,6 +332,13 @@ define i16 @shr16r1(i16 noundef %a) { ; CHECK-NEXT: shrl %eax # EVEX TO LEGACY Compression encoding: [0xd1,0xe8] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr16r1: +; NF: # %bb.0: # %entry +; NF-NEXT: movzwl %di, %eax # encoding: [0x0f,0xb7,0xc7] +; NF-NEXT: shrl %eax # EVEX TO 
LEGACY Compression encoding: [0xd1,0xe8] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %shr = lshr i16 %a, 1 ret i16 %shr @@ -232,6 +349,11 @@ define i32 @shr32r1(i32 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shrl %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd1,0xef] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr32r1: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} shrl %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd1,0xef] +; NF-NEXT: retq # encoding: [0xc3] entry: %shr = lshr i32 %a, 1 ret i32 %shr @@ -242,6 +364,11 @@ define i64 @shr64r1(i64 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shrq %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd1,0xef] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr64r1: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} shrq %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd1,0xef] +; NF-NEXT: retq # encoding: [0xc3] entry: %shr = lshr i64 %a, 1 ret i64 %shr @@ -254,6 +381,13 @@ define i8 @shr8rcl(i8 noundef %a, i8 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shrb %cl, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0xef] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr8rcl: +; NF: # %bb.0: # %entry +; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $ecx +; NF-NEXT: {nf} shrb %cl, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd2,0xef] +; NF-NEXT: retq # encoding: [0xc3] entry: %shr = lshr i8 %a, %cl ret i8 %shr @@ -266,6 +400,13 @@ define i8 @shr8rcl_mask(i8 noundef %a, i8 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shrb %cl, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0xef] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr8rcl_mask: +; NF: # %bb.0: # %entry +; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $ecx +; 
NF-NEXT: {nf} shrb %cl, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd2,0xef] +; NF-NEXT: retq # encoding: [0xc3] entry: %shamt = and i8 %cl, 31 %shr = lshr i8 %a, %shamt @@ -281,6 +422,15 @@ define i16 @shr16rcl(i16 noundef %a, i16 %cl) { ; CHECK-NEXT: shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr16rcl: +; NF: # %bb.0: # %entry +; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; NF-NEXT: movzwl %di, %eax # encoding: [0x0f,0xb7,0xc7] +; NF-NEXT: # kill: def $cl killed $cl killed $ecx +; NF-NEXT: shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %shr = lshr i16 %a, %cl ret i16 %shr @@ -295,6 +445,15 @@ define i16 @shr16rcl_mask(i16 noundef %a, i16 %cl) { ; CHECK-NEXT: shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr16rcl_mask: +; NF: # %bb.0: # %entry +; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; NF-NEXT: movzwl %di, %eax # encoding: [0x0f,0xb7,0xc7] +; NF-NEXT: # kill: def $cl killed $cl killed $ecx +; NF-NEXT: shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %shamt = and i16 %cl, 31 %shr = lshr i16 %a, %shamt @@ -308,6 +467,13 @@ define i32 @shr32rcl(i32 noundef %a, i32 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shrl %cl, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0xef] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr32rcl: +; NF: # %bb.0: # %entry +; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $ecx +; NF-NEXT: {nf} shrl %cl, %edi, %eax # EVEX TO EVEX 
Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0xef] +; NF-NEXT: retq # encoding: [0xc3] entry: %shr = lshr i32 %a, %cl ret i32 %shr @@ -320,6 +486,13 @@ define i32 @shr32rcl_mask(i32 noundef %a, i32 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shrl %cl, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0xef] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr32rcl_mask: +; NF: # %bb.0: # %entry +; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $ecx +; NF-NEXT: {nf} shrl %cl, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0xef] +; NF-NEXT: retq # encoding: [0xc3] entry: %shamt = and i32 %cl, 31 %shr = lshr i32 %a, %shamt @@ -333,6 +506,13 @@ define i64 @shr64rcl(i64 noundef %a, i64 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-NEXT: shrq %cl, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0xef] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr64rcl: +; NF: # %bb.0: # %entry +; NF-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $rcx +; NF-NEXT: {nf} shrq %cl, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd3,0xef] +; NF-NEXT: retq # encoding: [0xc3] entry: %shr = lshr i64 %a, %cl ret i64 %shr @@ -345,6 +525,13 @@ define i64 @shr64rcl_mask(i64 noundef %a, i64 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-NEXT: shrq %cl, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0xef] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr64rcl_mask: +; NF: # %bb.0: # %entry +; NF-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $rcx +; NF-NEXT: {nf} shrq %cl, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd3,0xef] +; NF-NEXT: retq # encoding: [0xc3] entry: %shamt = and i64 %cl, 63 %shr = lshr i64 %a, %shamt @@ -356,6 +543,11 @@ define i8 @shr8ri(i8 noundef %a) { ; CHECK: 
# %bb.0: # %entry ; CHECK-NEXT: shrb $4, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0xef,0x04] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr8ri: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} shrb $4, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc0,0xef,0x04] +; NF-NEXT: retq # encoding: [0xc3] entry: %shr = lshr i8 %a, 4 ret i8 %shr @@ -368,6 +560,13 @@ define i16 @shr16ri(i16 noundef %a) { ; CHECK-NEXT: shrl $4, %eax # EVEX TO LEGACY Compression encoding: [0xc1,0xe8,0x04] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr16ri: +; NF: # %bb.0: # %entry +; NF-NEXT: movzwl %di, %eax # encoding: [0x0f,0xb7,0xc7] +; NF-NEXT: shrl $4, %eax # EVEX TO LEGACY Compression encoding: [0xc1,0xe8,0x04] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %shr = lshr i16 %a, 4 ret i16 %shr @@ -378,6 +577,11 @@ define i32 @shr32ri(i32 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shrl $4, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0xef,0x04] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr32ri: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} shrl $4, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc1,0xef,0x04] +; NF-NEXT: retq # encoding: [0xc3] entry: %shr = lshr i32 %a, 4 ret i32 %shr @@ -388,6 +592,11 @@ define i64 @shr64ri(i64 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shrq $4, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0xef,0x04] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr64ri: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} shrq $4, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xc1,0xef,0x04] +; NF-NEXT: retq # encoding: [0xc3] entry: %shr = lshr i64 %a, 4 ret i64 %shr @@ -398,6 +607,11 @@ define void @shr8m1_legacy(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shrb (%rdi) # encoding: [0xd0,0x2f] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; 
NF-LABEL: shr8m1_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: shrb (%rdi) # encoding: [0xd0,0x2f] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i8, ptr %ptr %shr = lshr i8 %a, 1 @@ -410,6 +624,11 @@ define void @shr16m1_legacy(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shrw (%rdi) # encoding: [0x66,0xd1,0x2f] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr16m1_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: shrw (%rdi) # encoding: [0x66,0xd1,0x2f] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i16, ptr %ptr %shr = lshr i16 %a, 1 @@ -422,6 +641,11 @@ define void @shr32m1_legacy(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shrl (%rdi) # encoding: [0xd1,0x2f] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr32m1_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: shrl (%rdi) # encoding: [0xd1,0x2f] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i32, ptr %ptr %shr = lshr i32 %a, 1 @@ -434,6 +658,11 @@ define void @shr64m1_legacy(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shrq (%rdi) # encoding: [0x48,0xd1,0x2f] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr64m1_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: shrq (%rdi) # encoding: [0x48,0xd1,0x2f] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i64, ptr %ptr %shr = lshr i64 %a, 1 @@ -446,6 +675,11 @@ define void @shr8mi_legacy(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shrb $4, (%rdi) # encoding: [0xc0,0x2f,0x04] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr8mi_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: shrb $4, (%rdi) # encoding: [0xc0,0x2f,0x04] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i8, ptr %ptr %shr = lshr i8 %a, 4 @@ -458,6 +692,11 @@ define void @shr16mi_legacy(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shrw $4, (%rdi) # encoding: [0x66,0xc1,0x2f,0x04] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr16mi_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: shrw $4, (%rdi) # encoding: 
[0x66,0xc1,0x2f,0x04] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i16, ptr %ptr %shr = lshr i16 %a, 4 @@ -470,6 +709,11 @@ define void @shr32mi_legacy(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shrl $4, (%rdi) # encoding: [0xc1,0x2f,0x04] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr32mi_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: shrl $4, (%rdi) # encoding: [0xc1,0x2f,0x04] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i32, ptr %ptr %shr = lshr i32 %a, 4 @@ -482,6 +726,11 @@ define void @shr64mi_legacy(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: shrq $4, (%rdi) # encoding: [0x48,0xc1,0x2f,0x04] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr64mi_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: shrq $4, (%rdi) # encoding: [0x48,0xc1,0x2f,0x04] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i64, ptr %ptr %shr = lshr i64 %a, 4 @@ -496,6 +745,13 @@ define void @shr8mcl_legacy(ptr %ptr, i8 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shrb %cl, (%rdi) # encoding: [0xd2,0x2f] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr8mcl_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $ecx +; NF-NEXT: shrb %cl, (%rdi) # encoding: [0xd2,0x2f] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i8, ptr %ptr %shr = lshr i8 %a, %cl @@ -510,6 +766,13 @@ define void @shr16mcl_legacy(ptr %ptr, i16 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shrw %cl, (%rdi) # encoding: [0x66,0xd3,0x2f] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr16mcl_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $ecx +; NF-NEXT: shrw %cl, (%rdi) # encoding: [0x66,0xd3,0x2f] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i16, ptr %ptr %shr = lshr i16 %a, %cl @@ -524,6 +787,13 @@ define void @shr32mcl_legacy(ptr 
%ptr, i32 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shrl %cl, (%rdi) # encoding: [0xd3,0x2f] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr32mcl_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $ecx +; NF-NEXT: shrl %cl, (%rdi) # encoding: [0xd3,0x2f] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i32, ptr %ptr %shr = lshr i32 %a, %cl @@ -538,6 +808,13 @@ define void @shr64mcl_legacy(ptr %ptr, i64 %cl) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-NEXT: shrq %cl, (%rdi) # encoding: [0x48,0xd3,0x2f] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: shr64mcl_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; NF-NEXT: # kill: def $cl killed $cl killed $rcx +; NF-NEXT: shrq %cl, (%rdi) # encoding: [0x48,0xd3,0x2f] +; NF-NEXT: retq # encoding: [0xc3] entry: %a = load i64, ptr %ptr %shr = lshr i64 %a, %cl diff --git a/llvm/test/CodeGen/X86/apx/sub.ll b/llvm/test/CodeGen/X86/apx/sub.ll index be0914c90b9faf..a38d09587ba919 100644 --- a/llvm/test/CodeGen/X86/apx/sub.ll +++ b/llvm/test/CodeGen/X86/apx/sub.ll @@ -1,11 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s define i8 @sub8rr(i8 noundef %a, i8 noundef %b) { ; CHECK-LABEL: sub8rr: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subb %sil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x28,0xf7] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub8rr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} subb %sil, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x28,0xf7] +; NF-NEXT: retq # encoding: [0xc3] entry: %sub = sub i8 %a, %b ret i8 %sub @@ -17,6 
+23,12 @@ define i16 @sub16rr(i16 noundef %a, i16 noundef %b) { ; CHECK-NEXT: subl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0xf7] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub16rr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} subl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x29,0xf7] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %sub = sub i16 %a, %b ret i16 %sub @@ -27,6 +39,11 @@ define i32 @sub32rr(i32 noundef %a, i32 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0xf7] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub32rr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} subl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x29,0xf7] +; NF-NEXT: retq # encoding: [0xc3] entry: %sub = sub i32 %a, %b ret i32 %sub @@ -37,6 +54,11 @@ define i64 @sub64rr(i64 noundef %a, i64 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq %rsi, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xf7] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub64rr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} subq %rsi, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x29,0xf7] +; NF-NEXT: retq # encoding: [0xc3] entry: %sub = sub i64 %a, %b ret i64 %sub @@ -47,6 +69,11 @@ define i8 @sub8rm(i8 noundef %a, ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x2a,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub8rm: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} subb (%rsi), %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x2a,0x3e] +; NF-NEXT: retq # encoding: [0xc3] entry: %b = load i8, ptr %ptr %sub = sub i8 %a, %b @@ -58,6 +85,11 @@ define i16 @sub16rm(i16 noundef %a, ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subw 
(%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x2b,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub16rm: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} subw (%rsi), %di, %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x2b,0x3e] +; NF-NEXT: retq # encoding: [0xc3] entry: %b = load i16, ptr %ptr %sub = sub i16 %a, %b @@ -69,6 +101,11 @@ define i32 @sub32rm(i32 noundef %a, ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x2b,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub32rm: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} subl (%rsi), %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x2b,0x3e] +; NF-NEXT: retq # encoding: [0xc3] entry: %b = load i32, ptr %ptr %sub = sub i32 %a, %b @@ -80,6 +117,11 @@ define i64 @sub64rm(i64 noundef %a, ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x2b,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub64rm: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} subq (%rsi), %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x2b,0x3e] +; NF-NEXT: retq # encoding: [0xc3] entry: %b = load i64, ptr %ptr %sub = sub i64 %a, %b @@ -92,6 +134,12 @@ define i16 @sub16ri8(i16 noundef %a) { ; CHECK-NEXT: subl $-128, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xef,0x80] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub16ri8: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} subl $-128, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xef,0x80] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %sub = sub i16 %a, -128 ret i16 %sub @@ -102,6 +150,11 @@ define i32 @sub32ri8(i32 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subl $-128, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xef,0x80] ; CHECK-NEXT: retq # 
encoding: [0xc3] +; +; NF-LABEL: sub32ri8: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} subl $-128, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xef,0x80] +; NF-NEXT: retq # encoding: [0xc3] entry: %sub = sub i32 %a, -128 ret i32 %sub @@ -112,6 +165,11 @@ define i64 @sub64ri8(i64 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $-128, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xef,0x80] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub64ri8: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} subq $-128, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x83,0xef,0x80] +; NF-NEXT: retq # encoding: [0xc3] entry: %sub = sub i64 %a, -128 ret i64 %sub @@ -122,6 +180,11 @@ define i8 @sub8ri(i8 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addb $-123, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0xc7,0x85] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub8ri: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addb $-123, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0xc7,0x85] +; NF-NEXT: retq # encoding: [0xc3] entry: %sub = sub i8 %a, 123 ret i8 %sub @@ -134,6 +197,13 @@ define i16 @sub16ri(i16 noundef %a) { ; CHECK-NEXT: # imm = 0xFB2E ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub16ri: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addl $-1234, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xc7,0x2e,0xfb,0xff,0xff] +; NF-NEXT: # imm = 0xFB2E +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %sub = sub i16 %a, 1234 ret i16 %sub @@ -145,6 +215,12 @@ define i32 @sub32ri(i32 noundef %a) { ; CHECK-NEXT: addl $-123456, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xc7,0xc0,0x1d,0xfe,0xff] ; CHECK-NEXT: # imm = 0xFFFE1DC0 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub32ri: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addl $-123456, 
%edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xc7,0xc0,0x1d,0xfe,0xff] +; NF-NEXT: # imm = 0xFFFE1DC0 +; NF-NEXT: retq # encoding: [0xc3] entry: %sub = sub i32 %a, 123456 ret i32 %sub @@ -156,6 +232,12 @@ define i64 @sub64ri(i64 noundef %a) { ; CHECK-NEXT: subq $-2147483648, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0xef,0x00,0x00,0x00,0x80] ; CHECK-NEXT: # imm = 0x80000000 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub64ri: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} subq $-2147483648, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0xef,0x00,0x00,0x00,0x80] +; NF-NEXT: # imm = 0x80000000 +; NF-NEXT: retq # encoding: [0xc3] entry: %sub = sub i64 %a, -2147483648 ret i64 %sub @@ -166,6 +248,11 @@ define i8 @sub8mr(ptr %a, i8 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subb %sil, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x28,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub8mr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} subb %sil, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x28,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i8, ptr %a %sub = sub nsw i8 %t, %b @@ -179,6 +266,13 @@ define i16 @sub16mr(ptr %a, i16 noundef %b) { ; CHECK-NEXT: subl %esi, %eax # EVEX TO LEGACY Compression encoding: [0x29,0xf0] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub16mr: +; NF: # %bb.0: # %entry +; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07] +; NF-NEXT: subl %esi, %eax # EVEX TO LEGACY Compression encoding: [0x29,0xf0] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i16, ptr %a %sub = sub nsw i16 %t, %b @@ -190,6 +284,11 @@ define i32 @sub32mr(ptr %a, i32 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subl %esi, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; 
+; NF-LABEL: sub32mr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} subl %esi, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x29,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i32, ptr %a %sub = sub nsw i32 %t, %b @@ -201,6 +300,11 @@ define i64 @sub64mr(ptr %a, i64 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq %rsi, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub64mr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} subq %rsi, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x29,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i64, ptr %a %sub = sub nsw i64 %t, %b @@ -214,6 +318,13 @@ define i16 @sub16mi8(ptr %a) { ; CHECK-NEXT: subl $-128, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xe8,0x80] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub16mi8: +; NF: # %bb.0: # %entry +; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07] +; NF-NEXT: subl $-128, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xe8,0x80] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i16, ptr %a %sub = sub nsw i16 %t, -128 @@ -225,6 +336,11 @@ define i32 @sub32mi8(ptr %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subl $-128, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x2f,0x80] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub32mi8: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} subl $-128, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0x2f,0x80] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i32, ptr %a %sub = sub nsw i32 %t, -128 @@ -236,6 +352,11 @@ define i64 @sub64mi8(ptr %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $-128, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0x2f,0x80] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub64mi8: +; NF: # %bb.0: # 
%entry +; NF-NEXT: {nf} subq $-128, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x83,0x2f,0x80] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i64, ptr %a %sub = sub nsw i64 %t, -128 @@ -247,6 +368,11 @@ define i8 @sub8mi(ptr %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addb $-123, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0x07,0x85] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub8mi: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addb $-123, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0x07,0x85] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i8, ptr %a %sub = sub nsw i8 %t, 123 @@ -261,6 +387,14 @@ define i16 @sub16mi(ptr %a) { ; CHECK-NEXT: # imm = 0xFB2E ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub16mi: +; NF: # %bb.0: # %entry +; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07] +; NF-NEXT: addl $-1234, %eax # EVEX TO LEGACY Compression encoding: [0x05,0x2e,0xfb,0xff,0xff] +; NF-NEXT: # imm = 0xFB2E +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i16, ptr %a %sub = sub nsw i16 %t, 1234 @@ -273,6 +407,12 @@ define i32 @sub32mi(ptr %a) { ; CHECK-NEXT: addl $-123456, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0x07,0xc0,0x1d,0xfe,0xff] ; CHECK-NEXT: # imm = 0xFFFE1DC0 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub32mi: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} addl $-123456, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0x07,0xc0,0x1d,0xfe,0xff] +; NF-NEXT: # imm = 0xFFFE1DC0 +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i32, ptr %a %sub = sub nsw i32 %t, 123456 @@ -285,6 +425,12 @@ define i64 @sub64mi(ptr %a) { ; CHECK-NEXT: subq $-2147483648, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x2f,0x00,0x00,0x00,0x80] ; CHECK-NEXT: # imm = 0x80000000 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; 
NF-LABEL: sub64mi: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} subq $-2147483648, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0x2f,0x00,0x00,0x00,0x80] +; NF-NEXT: # imm = 0x80000000 +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i64, ptr %a %sub = sub nsw i64 %t, -2147483648 @@ -305,6 +451,15 @@ define i8 @subflag8rr(i8 noundef %a, i8 noundef %b) { ; CHECK-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: subflag8rr: +; NF: # %bb.0: # %entry +; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; NF-NEXT: subb %sil, %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x28,0xf7] +; NF-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9] +; NF-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] +; NF-NEXT: # kill: def $al killed $al killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %sub = call i8 @llvm.usub.sat.i8(i8 %a, i8 %b) ret i8 %sub @@ -318,6 +473,14 @@ define i16 @subflag16rr(i16 noundef %a, i16 noundef %b) { ; CHECK-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: subflag16rr: +; NF: # %bb.0: # %entry +; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; NF-NEXT: subw %si, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x29,0xf7] +; NF-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %sub = call i16 @llvm.usub.sat.i16(i16 %a, i16 %b) ret i16 %sub @@ -330,6 +493,13 @@ define i32 @subflag32rr(i32 noundef %a, i32 noundef %b) { ; CHECK-NEXT: subl %esi, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x29,0xf7] ; CHECK-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] ; CHECK-NEXT: retq 
# encoding: [0xc3] +; +; NF-LABEL: subflag32rr: +; NF: # %bb.0: # %entry +; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; NF-NEXT: subl %esi, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x29,0xf7] +; NF-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] +; NF-NEXT: retq # encoding: [0xc3] entry: %sub = call i32 @llvm.usub.sat.i32(i32 %a, i32 %b) ret i32 %sub @@ -342,6 +512,13 @@ define i64 @subflag64rr(i64 noundef %a, i64 noundef %b) { ; CHECK-NEXT: subq %rsi, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x29,0xf7] ; CHECK-NEXT: cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: subflag64rr: +; NF: # %bb.0: # %entry +; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; NF-NEXT: subq %rsi, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x29,0xf7] +; NF-NEXT: cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1] +; NF-NEXT: retq # encoding: [0xc3] entry: %sub = call i64 @llvm.usub.sat.i64(i64 %a, i64 %b) ret i64 %sub @@ -356,6 +533,15 @@ define i8 @subflag8rm(i8 noundef %a, ptr %b) { ; CHECK-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: subflag8rm: +; NF: # %bb.0: # %entry +; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; NF-NEXT: subb (%rsi), %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x2a,0x3e] +; NF-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9] +; NF-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] +; NF-NEXT: # kill: def $al killed $al killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %t = load i8, ptr %b %sub = call i8 @llvm.usub.sat.i8(i8 %a, i8 %t) @@ -370,6 +556,14 @@ define i16 @subflag16rm(i16 noundef %a, ptr %b) { ; CHECK-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] ; CHECK-NEXT: # kill: def $ax 
killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: subflag16rm: +; NF: # %bb.0: # %entry +; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; NF-NEXT: subw (%rsi), %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x2b,0x3e] +; NF-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %t = load i16, ptr %b %sub = call i16 @llvm.usub.sat.i16(i16 %a, i16 %t) @@ -383,6 +577,13 @@ define i32 @subflag32rm(i32 noundef %a, ptr %b) { ; CHECK-NEXT: subl (%rsi), %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x2b,0x3e] ; CHECK-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: subflag32rm: +; NF: # %bb.0: # %entry +; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; NF-NEXT: subl (%rsi), %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x2b,0x3e] +; NF-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] +; NF-NEXT: retq # encoding: [0xc3] entry: %t = load i32, ptr %b %sub = call i32 @llvm.usub.sat.i32(i32 %a, i32 %t) @@ -396,6 +597,13 @@ define i64 @subflag64rm(i64 noundef %a, ptr %b) { ; CHECK-NEXT: subq (%rsi), %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x2b,0x3e] ; CHECK-NEXT: cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: subflag64rm: +; NF: # %bb.0: # %entry +; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; NF-NEXT: subq (%rsi), %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x2b,0x3e] +; NF-NEXT: cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1] +; NF-NEXT: retq # encoding: [0xc3] entry: %t = load i64, ptr %b %sub = call i64 @llvm.usub.sat.i64(i64 %a, i64 %t) @@ -410,6 +618,14 @@ define i16 @subflag16ri8(i16 noundef %a) { ; CHECK-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] 
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: subflag16ri8: +; NF: # %bb.0: # %entry +; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; NF-NEXT: subw $123, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xef,0x7b] +; NF-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %sub = call i16 @llvm.usub.sat.i16(i16 %a, i16 123) ret i16 %sub @@ -422,6 +638,13 @@ define i32 @subflag32ri8(i32 noundef %a) { ; CHECK-NEXT: subl $123, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x83,0xef,0x7b] ; CHECK-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: subflag32ri8: +; NF: # %bb.0: # %entry +; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; NF-NEXT: subl $123, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x83,0xef,0x7b] +; NF-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] +; NF-NEXT: retq # encoding: [0xc3] entry: %sub = call i32 @llvm.usub.sat.i32(i32 %a, i32 123) ret i32 %sub @@ -434,6 +657,13 @@ define i64 @subflag64ri8(i64 noundef %a) { ; CHECK-NEXT: subq $123, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x83,0xef,0x7b] ; CHECK-NEXT: cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: subflag64ri8: +; NF: # %bb.0: # %entry +; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; NF-NEXT: subq $123, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x83,0xef,0x7b] +; NF-NEXT: cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1] +; NF-NEXT: retq # encoding: [0xc3] entry: %sub = call i64 @llvm.usub.sat.i64(i64 %a, i64 123) ret i64 %sub @@ -448,6 +678,15 @@ define i8 @subflag8ri(i8 noundef %a) { ; CHECK-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: 
[0x0f,0x43,0xc1] ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: subflag8ri: +; NF: # %bb.0: # %entry +; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; NF-NEXT: subb $123, %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xef,0x7b] +; NF-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9] +; NF-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] +; NF-NEXT: # kill: def $al killed $al killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %sub = call i8 @llvm.usub.sat.i8(i8 %a, i8 123) ret i8 %sub @@ -462,6 +701,15 @@ define i16 @subflag16ri(i16 noundef %a) { ; CHECK-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: subflag16ri: +; NF: # %bb.0: # %entry +; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; NF-NEXT: subw $1234, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x81,0xef,0xd2,0x04] +; NF-NEXT: # imm = 0x4D2 +; NF-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %sub = call i16 @llvm.usub.sat.i16(i16 %a, i16 1234) ret i16 %sub @@ -475,6 +723,14 @@ define i32 @subflag32ri(i32 noundef %a) { ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: subflag32ri: +; NF: # %bb.0: # %entry +; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; NF-NEXT: subl $123456, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x81,0xef,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] +; NF-NEXT: retq # encoding: [0xc3] entry: %sub = call i32 @llvm.usub.sat.i32(i32 %a, i32 123456) ret i32 %sub @@ -488,6 +744,14 @@ define i64 
@subflag64ri(i64 noundef %a) { ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: subflag64ri: +; NF: # %bb.0: # %entry +; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; NF-NEXT: subq $123456, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x81,0xef,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1] +; NF-NEXT: retq # encoding: [0xc3] entry: %sub = call i64 @llvm.usub.sat.i64(i64 %a, i64 123456) ret i64 %sub @@ -513,6 +777,22 @@ define void @sub64ri_reloc(i64 %val) { ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: .LBB41_2: # %f ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub64ri_reloc: +; NF: # %bb.0: +; NF-NEXT: cmpq $val, %rdi # encoding: [0x48,0x81,0xff,A,A,A,A] +; NF-NEXT: # fixup A - offset: 3, value: val, kind: reloc_signed_4byte +; NF-NEXT: jbe .LBB41_2 # encoding: [0x76,A] +; NF-NEXT: # fixup A - offset: 1, value: .LBB41_2-1, kind: FK_PCRel_1 +; NF-NEXT: # %bb.1: # %t +; NF-NEXT: pushq %rax # encoding: [0x50] +; NF-NEXT: .cfi_def_cfa_offset 16 +; NF-NEXT: callq f@PLT # encoding: [0xe8,A,A,A,A] +; NF-NEXT: # fixup A - offset: 1, value: f@PLT-4, kind: FK_PCRel_4 +; NF-NEXT: popq %rax # encoding: [0x58] +; NF-NEXT: .cfi_def_cfa_offset 8 +; NF-NEXT: .LBB41_2: # %f +; NF-NEXT: retq # encoding: [0xc3] %cmp = icmp ugt i64 %val, ptrtoint (ptr @val to i64) br i1 %cmp, label %t, label %f @@ -529,6 +809,11 @@ define void @sub8mr_legacy(ptr %a, i8 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subb %sil, (%rdi) # encoding: [0x40,0x28,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub8mr_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: subb %sil, (%rdi) # encoding: [0x40,0x28,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i8, ptr %a %sub = sub i8 %t, %b @@ -541,6 +826,11 @@ define void @sub16mr_legacy(ptr %a, i16 
noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subw %si, (%rdi) # encoding: [0x66,0x29,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub16mr_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: subw %si, (%rdi) # encoding: [0x66,0x29,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i16, ptr %a %sub = sub i16 %t, %b @@ -553,6 +843,11 @@ define void @sub32mr_legacy(ptr %a, i32 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subl %esi, (%rdi) # encoding: [0x29,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub32mr_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: subl %esi, (%rdi) # encoding: [0x29,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i32, ptr %a %sub = sub i32 %t, %b @@ -565,6 +860,11 @@ define void @sub64mr_legacy(ptr %a, i64 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq %rsi, (%rdi) # encoding: [0x48,0x29,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub64mr_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: subq %rsi, (%rdi) # encoding: [0x48,0x29,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i64, ptr %a %sub = sub i64 %t, %b @@ -577,6 +877,11 @@ define void @sub8mi_legacy(ptr %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addb $-123, (%rdi) # encoding: [0x80,0x07,0x85] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub8mi_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: addb $-123, (%rdi) # encoding: [0x80,0x07,0x85] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i8, ptr %a %sub = sub nsw i8 %t, 123 @@ -590,6 +895,12 @@ define void @sub16mi_legacy(ptr %a) { ; CHECK-NEXT: addw $-1234, (%rdi) # encoding: [0x66,0x81,0x07,0x2e,0xfb] ; CHECK-NEXT: # imm = 0xFB2E ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub16mi_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: addw $-1234, (%rdi) # encoding: [0x66,0x81,0x07,0x2e,0xfb] +; NF-NEXT: # imm = 0xFB2E +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i16, ptr %a %sub = sub nsw i16 %t, 1234 @@ -603,6 +914,12 @@ define 
void @sub32mi_legacy(ptr %a) { ; CHECK-NEXT: addl $-123456, (%rdi) # encoding: [0x81,0x07,0xc0,0x1d,0xfe,0xff] ; CHECK-NEXT: # imm = 0xFFFE1DC0 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub32mi_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: addl $-123456, (%rdi) # encoding: [0x81,0x07,0xc0,0x1d,0xfe,0xff] +; NF-NEXT: # imm = 0xFFFE1DC0 +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i32, ptr %a %sub = sub nsw i32 %t, 123456 @@ -616,6 +933,12 @@ define void @sub64mi_legacy(ptr %a) { ; CHECK-NEXT: addq $-123456, (%rdi) # encoding: [0x48,0x81,0x07,0xc0,0x1d,0xfe,0xff] ; CHECK-NEXT: # imm = 0xFFFE1DC0 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: sub64mi_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: addq $-123456, (%rdi) # encoding: [0x48,0x81,0x07,0xc0,0x1d,0xfe,0xff] +; NF-NEXT: # imm = 0xFFFE1DC0 +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i64, ptr %a %sub = sub nsw i64 %t, 123456 diff --git a/llvm/test/CodeGen/X86/apx/xor.ll b/llvm/test/CodeGen/X86/apx/xor.ll index d203fbb02782ab..436b16b4292dfc 100644 --- a/llvm/test/CodeGen/X86/apx/xor.ll +++ b/llvm/test/CodeGen/X86/apx/xor.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s define i8 @xor8rr(i8 noundef %a, i8 noundef %b) { ; CHECK-LABEL: xor8rr: @@ -7,6 +8,12 @@ define i8 @xor8rr(i8 noundef %a, i8 noundef %b) { ; CHECK-NEXT: xorl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x31,0xf7] ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xor8rr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} xorl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x31,0xf7] +; NF-NEXT: # kill: def $al killed $al killed $eax +; NF-NEXT: retq # 
encoding: [0xc3] entry: %xor = xor i8 %a, %b ret i8 %xor @@ -18,6 +25,12 @@ define i16 @xor16rr(i16 noundef %a, i16 noundef %b) { ; CHECK-NEXT: xorl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x31,0xf7] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xor16rr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} xorl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x31,0xf7] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %xor = xor i16 %a, %b ret i16 %xor @@ -28,6 +41,11 @@ define i32 @xor32rr(i32 noundef %a, i32 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x31,0xf7] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xor32rr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} xorl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x31,0xf7] +; NF-NEXT: retq # encoding: [0xc3] entry: %xor = xor i32 %a, %b ret i32 %xor @@ -38,6 +56,11 @@ define i64 @xor64rr(i64 noundef %a, i64 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorq %rsi, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x31,0xf7] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xor64rr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} xorq %rsi, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x31,0xf7] +; NF-NEXT: retq # encoding: [0xc3] entry: %xor = xor i64 %a, %b ret i64 %xor @@ -48,6 +71,11 @@ define i8 @xor8rm(i8 noundef %a, ptr %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x32,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xor8rm: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} xorb (%rsi), %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x32,0x3e] +; NF-NEXT: retq # encoding: [0xc3] entry: %t = load i8, ptr %b %xor = xor i8 %a, %t @@ -59,6 +87,11 @@ define i16 @xor16rm(i16 noundef 
%a, ptr %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x33,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xor16rm: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} xorw (%rsi), %di, %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x33,0x3e] +; NF-NEXT: retq # encoding: [0xc3] entry: %t = load i16, ptr %b %xor = xor i16 %a, %t @@ -70,6 +103,11 @@ define i32 @xor32rm(i32 noundef %a, ptr %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x33,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xor32rm: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} xorl (%rsi), %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x33,0x3e] +; NF-NEXT: retq # encoding: [0xc3] entry: %t = load i32, ptr %b %xor = xor i32 %a, %t @@ -81,6 +119,11 @@ define i64 @xor64rm(i64 noundef %a, ptr %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x33,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xor64rm: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} xorq (%rsi), %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x33,0x3e] +; NF-NEXT: retq # encoding: [0xc3] entry: %t = load i64, ptr %b %xor = xor i64 %a, %t @@ -93,6 +136,12 @@ define i16 @xor16ri8(i16 noundef %a) { ; CHECK-NEXT: xorl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xf7,0x7b] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xor16ri8: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} xorl $123, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xf7,0x7b] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %xor = xor i16 %a, 123 ret i16 %xor @@ -103,6 +152,11 @@ define i32 @xor32ri8(i32 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorl $123, %edi, %eax # encoding: 
[0x62,0xf4,0x7c,0x18,0x83,0xf7,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xor32ri8: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} xorl $123, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xf7,0x7b] +; NF-NEXT: retq # encoding: [0xc3] entry: %xor = xor i32 %a, 123 ret i32 %xor @@ -113,6 +167,11 @@ define i64 @xor64ri8(i64 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorq $123, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xf7,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xor64ri8: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} xorq $123, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x83,0xf7,0x7b] +; NF-NEXT: retq # encoding: [0xc3] entry: %xor = xor i64 %a, 123 ret i64 %xor @@ -123,6 +182,11 @@ define i8 @xor8ri(i8 noundef %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorb $123, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0xf7,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xor8ri: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} xorb $123, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0xf7,0x7b] +; NF-NEXT: retq # encoding: [0xc3] entry: %xor = xor i8 %a, 123 ret i8 %xor @@ -135,6 +199,13 @@ define i16 @xor16ri(i16 noundef %a) { ; CHECK-NEXT: # imm = 0x4D2 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xor16ri: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} xorl $1234, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xf7,0xd2,0x04,0x00,0x00] +; NF-NEXT: # imm = 0x4D2 +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %xor = xor i16 %a, 1234 ret i16 %xor @@ -146,6 +217,12 @@ define i32 @xor32ri(i32 noundef %a) { ; CHECK-NEXT: xorl $123456, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xf7,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xor32ri: +; NF: # %bb.0: 
# %entry +; NF-NEXT: {nf} xorl $123456, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xf7,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: retq # encoding: [0xc3] entry: %xor = xor i32 %a, 123456 ret i32 %xor @@ -157,6 +234,12 @@ define i64 @xor64ri(i64 noundef %a) { ; CHECK-NEXT: xorq $123456, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0xf7,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xor64ri: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} xorq $123456, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0xf7,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: retq # encoding: [0xc3] entry: %xor = xor i64 %a, 123456 ret i64 %xor @@ -167,6 +250,11 @@ define i8 @xor8mr(ptr %a, i8 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorb %sil, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x30,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xor8mr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} xorb %sil, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x30,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i8, ptr %a %xor = xor i8 %t, %b @@ -178,6 +266,11 @@ define i16 @xor16mr(ptr %a, i16 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorw %si, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x31,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xor16mr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} xorw %si, (%rdi), %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x31,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i16, ptr %a %xor = xor i16 %t, %b @@ -189,6 +282,11 @@ define i32 @xor32mr(ptr %a, i32 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorl %esi, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x31,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xor32mr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} xorl %esi, (%rdi), %eax # EVEX TO EVEX 
Compression encoding: [0x62,0xf4,0x7c,0x1c,0x31,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i32, ptr %a %xor = xor i32 %t, %b @@ -200,6 +298,11 @@ define i64 @xor64mr(ptr %a, i64 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorq %rsi, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x31,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xor64mr: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} xorq %rsi, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x31,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i64, ptr %a %xor = xor i64 %t, %b @@ -213,6 +316,13 @@ define i16 @xor16mi8(ptr %a) { ; CHECK-NEXT: xorl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xf0,0x7b] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xor16mi8: +; NF: # %bb.0: # %entry +; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07] +; NF-NEXT: xorl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xf0,0x7b] +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i16, ptr %a %xor = xor i16 %t, 123 @@ -224,6 +334,11 @@ define i32 @xor32mi8(ptr %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorl $123, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x37,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xor32mi8: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} xorl $123, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0x37,0x7b] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i32, ptr %a %xor = xor i32 %t, 123 @@ -235,6 +350,11 @@ define i64 @xor64mi8(ptr %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorq $123, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0x37,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xor64mi8: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} xorq $123, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x83,0x37,0x7b] +; 
NF-NEXT: retq # encoding: [0xc3] entry: %t= load i64, ptr %a %xor = xor i64 %t, 123 @@ -246,6 +366,11 @@ define i8 @xor8mi(ptr %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorb $123, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0x37,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xor8mi: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} xorb $123, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0x37,0x7b] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i8, ptr %a %xor = xor i8 %t, 123 @@ -260,6 +385,14 @@ define i16 @xor16mi(ptr %a) { ; CHECK-NEXT: # imm = 0x4D2 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xor16mi: +; NF: # %bb.0: # %entry +; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07] +; NF-NEXT: xorl $1234, %eax # EVEX TO LEGACY Compression encoding: [0x35,0xd2,0x04,0x00,0x00] +; NF-NEXT: # imm = 0x4D2 +; NF-NEXT: # kill: def $ax killed $ax killed $eax +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i16, ptr %a %xor = xor i16 %t, 1234 @@ -272,6 +405,12 @@ define i32 @xor32mi(ptr %a) { ; CHECK-NEXT: xorl $123456, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0x37,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xor32mi: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} xorl $123456, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0x37,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i32, ptr %a %xor = xor i32 %t, 123456 @@ -284,6 +423,12 @@ define i64 @xor64mi(ptr %a) { ; CHECK-NEXT: xorq $123456, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x37,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xor64mi: +; NF: # %bb.0: # %entry +; NF-NEXT: {nf} xorq $123456, (%rdi), %rax # EVEX TO EVEX Compression encoding: 
[0x62,0xf4,0xfc,0x1c,0x81,0x37,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i64, ptr %a %xor = xor i64 %t, 123456 @@ -301,6 +446,15 @@ define i1 @xorflag8rr(i8 %a, i8 %b) { ; CHECK-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xorflag8rr: +; NF: # %bb.0: +; NF-NEXT: {nf} xorl %edi, %esi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x31,0xfe] +; NF-NEXT: xorb $-1, %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xf0,0xff] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %xor = xor i8 %b, -1 %v0 = xor i8 %a, %xor ; 0xff << 50 %v1 = icmp eq i8 %v0, 0 @@ -317,6 +471,15 @@ define i1 @xorflag16rr(i16 %a, i16 %b) { ; CHECK-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xorflag16rr: +; NF: # %bb.0: +; NF-NEXT: {nf} xorl %edi, %esi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x31,0xfe] +; NF-NEXT: xorw $-1, %ax, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xf0,0xff] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %xor = xor i16 %b, -1 %v0 = xor i16 %a, %xor ; 0xff << 50 %v1 = icmp eq i16 %v0, 0 @@ -332,6 +495,14 @@ define i1 @xorflag32rr(i32 %a, i32 %b) { ; CHECK-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: 
xorflag32rr: +; NF: # %bb.0: +; NF-NEXT: xorl %esi, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x31,0xf7] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %v0 = xor i32 %a, %b ; 0xff << 50 %v1 = icmp eq i32 %v0, 0 store i32 %v0, ptr @d64 @@ -346,6 +517,14 @@ define i1 @xorflag64rr(i64 %a, i64 %b) { ; CHECK-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xorflag64rr: +; NF: # %bb.0: +; NF-NEXT: xorq %rsi, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x31,0xf7] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %v0 = xor i64 %a, %b ; 0xff << 50 %v1 = icmp eq i64 %v0, 0 store i64 %v0, ptr @d64 @@ -361,6 +540,15 @@ define i1 @xorflag8rm(ptr %ptr, i8 %b) { ; CHECK-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xorflag8rm: +; NF: # %bb.0: +; NF-NEXT: {nf} xorb (%rdi), %sil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x32,0x37] +; NF-NEXT: xorb $-1, %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xf0,0xff] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %a = load i8, ptr %ptr %xor = xor i8 %b, -1 %v0 = xor i8 %a, %xor ; 0xff << 50 @@ -378,6 +566,15 @@ define i1 @xorflag16rm(ptr %ptr, i16 %b) { ; CHECK-NEXT: movw %cx, d64(%rip) # encoding: 
[0x66,0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xorflag16rm: +; NF: # %bb.0: +; NF-NEXT: {nf} xorw (%rdi), %si, %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x33,0x37] +; NF-NEXT: xorw $-1, %ax, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xf0,0xff] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %a = load i16, ptr %ptr %xor = xor i16 %b, -1 %v0 = xor i16 %a, %xor ; 0xff << 50 @@ -394,6 +591,14 @@ define i1 @xorflag32rm(ptr %ptr, i32 %b) { ; CHECK-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xorflag32rm: +; NF: # %bb.0: +; NF-NEXT: xorl (%rdi), %esi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x33,0x37] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %a = load i32, ptr %ptr %v0 = xor i32 %a, %b ; 0xff << 50 %v1 = icmp eq i32 %v0, 0 @@ -409,6 +614,14 @@ define i1 @xorflag64rm(ptr %ptr, i64 %b) { ; CHECK-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xorflag64rm: +; NF: # %bb.0: +; NF-NEXT: xorq (%rdi), %rsi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x33,0x37] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %a = load i64, ptr %ptr %v0 = xor 
i64 %a, %b ; 0xff << 50 %v1 = icmp eq i64 %v0, 0 @@ -424,6 +637,14 @@ define i1 @xorflag8ri(i8 %a) { ; CHECK-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xorflag8ri: +; NF: # %bb.0: +; NF-NEXT: xorb $-124, %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xf7,0x84] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %xor = xor i8 123, -1 %v0 = xor i8 %a, %xor ; 0xff << 50 %v1 = icmp eq i8 %v0, 0 @@ -440,6 +661,15 @@ define i1 @xorflag16ri(i16 %a) { ; CHECK-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xorflag16ri: +; NF: # %bb.0: +; NF-NEXT: xorw $-1235, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x81,0xf7,0x2d,0xfb] +; NF-NEXT: # imm = 0xFB2D +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %xor = xor i16 1234, -1 %v0 = xor i16 %a, %xor ; 0xff << 50 %v1 = icmp eq i16 %v0, 0 @@ -456,6 +686,15 @@ define i1 @xorflag32ri(i32 %a) { ; CHECK-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xorflag32ri: +; NF: # %bb.0: +; NF-NEXT: xorl $123456, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x81,0xf7,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 2, value: d64-4, 
kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %v0 = xor i32 %a, 123456 ; 0xff << 50 %v1 = icmp eq i32 %v0, 0 store i32 %v0, ptr @d64 @@ -471,6 +710,15 @@ define i1 @xorflag64ri(i64 %a) { ; CHECK-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xorflag64ri: +; NF: # %bb.0: +; NF-NEXT: xorq $123456, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x81,0xf7,0x40,0xe2,0x01,0x00] +; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %v0 = xor i64 %a, 123456 ; 0xff << 50 %v1 = icmp eq i64 %v0, 0 store i64 %v0, ptr @d64 @@ -485,6 +733,14 @@ define i1 @xorflag16ri8(i16 %a) { ; CHECK-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xorflag16ri8: +; NF: # %bb.0: +; NF-NEXT: xorw $-124, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xf7,0x84] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %xor = xor i16 123, -1 %v0 = xor i16 %a, %xor ; 0xff << 50 %v1 = icmp eq i16 %v0, 0 @@ -500,6 +756,14 @@ define i1 @xorflag32ri8(i32 %a) { ; CHECK-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xorflag32ri8: +; NF: # %bb.0: +; NF-NEXT: xorl $123, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x83,0xf7,0x7b] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movl %ecx, 
d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %v0 = xor i32 %a, 123 ; 0xff << 50 %v1 = icmp eq i32 %v0, 0 store i32 %v0, ptr @d64 @@ -514,6 +778,14 @@ define i1 @xorflag64ri8(i64 %a) { ; CHECK-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xorflag64ri8: +; NF: # %bb.0: +; NF-NEXT: xorq $123, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x83,0xf7,0x7b] +; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NF-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte +; NF-NEXT: retq # encoding: [0xc3] %v0 = xor i64 %a, 123 ; 0xff << 50 %v1 = icmp eq i64 %v0, 0 store i64 %v0, ptr @d64 @@ -525,6 +797,11 @@ define void @xor8mr_legacy(ptr %a, i8 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorb %sil, (%rdi) # encoding: [0x40,0x30,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xor8mr_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: xorb %sil, (%rdi) # encoding: [0x40,0x30,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i8, ptr %a %xor = xor i8 %t, %b @@ -537,6 +814,11 @@ define void @xor16mr_legacy(ptr %a, i16 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorw %si, (%rdi) # encoding: [0x66,0x31,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xor16mr_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: xorw %si, (%rdi) # encoding: [0x66,0x31,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i16, ptr %a %xor = xor i16 %t, %b @@ -549,6 +831,11 @@ define void @xor32mr_legacy(ptr %a, i32 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorl %esi, (%rdi) # encoding: [0x31,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xor32mr_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: xorl %esi, 
(%rdi) # encoding: [0x31,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i32, ptr %a %xor = xor i32 %t, %b @@ -561,6 +848,11 @@ define void @xor64mr_legacy(ptr %a, i64 noundef %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorq %rsi, (%rdi) # encoding: [0x48,0x31,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; NF-LABEL: xor64mr_legacy: +; NF: # %bb.0: # %entry +; NF-NEXT: xorq %rsi, (%rdi) # encoding: [0x48,0x31,0x37] +; NF-NEXT: retq # encoding: [0xc3] entry: %t= load i64, ptr %a %xor = xor i64 %t, %b From 76e1a535fd7d8e9451414c76b55d82166c4c5409 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Wed, 29 May 2024 09:45:51 +0200 Subject: [PATCH 073/230] [llvm][bazel] Fix llvm-config after 3613b2683107bd60fda6d9348623be0686f6d7e3. --- utils/bazel/llvm_configs/llvm-config.h.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utils/bazel/llvm_configs/llvm-config.h.cmake b/utils/bazel/llvm_configs/llvm-config.h.cmake index 6605ea60df99e1..629977cc11d683 100644 --- a/utils/bazel/llvm_configs/llvm-config.h.cmake +++ b/utils/bazel/llvm_configs/llvm-config.h.cmake @@ -198,4 +198,7 @@ /* Define if plugins enabled */ #cmakedefine LLVM_ENABLE_PLUGINS +/* Define if logf128 is available */ +#cmakedefine LLVM_HAS_LOGF128 + #endif From 1c6746e2db58ab7c7a5fb44cd5efa852ce932f84 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 29 May 2024 08:56:41 +0100 Subject: [PATCH 074/230] [VectorCombine] Add support for zext/sext/trunc to shuffleToIdentity (#92696) This is one of the simple additions to shuffleToIdentity that help it look through intermediate zext/sext instructions. 
--- .../Transforms/Vectorize/VectorCombine.cpp | 5 +- .../AArch64/shuffletoidentity.ll | 51 ++++--------------- 2 files changed, 15 insertions(+), 41 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 056f0d6b3ee6c5..c3c4ee8479766e 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1746,6 +1746,9 @@ static Value *generateNewInstTree(ArrayRef Item, FixedVectorType *Ty, return Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]); if (auto *SI = dyn_cast(I)) return Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI); + if (auto *CI = dyn_cast(I)) + return Builder.CreateCast((Instruction::CastOps)CI->getOpcode(), Ops[0], + DstTy); if (II) return Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops); assert(isa(I) && "Unexpected instruction type in Generate"); @@ -1847,7 +1850,7 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) { isa(FrontV)) { Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0)); Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1)); - } else if (isa(FrontV)) { + } else if (isa(FrontV)) { Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0)); } else if (isa(FrontV)) { Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0)); diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll index 5cbda8a1e112ea..62fb0e6c7c11d9 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll @@ -570,19 +570,10 @@ define <8 x i16> @not_bitcast2(<4 x i32> %x, <8 x i16> %y) { define void @exttrunc(<8 x i32> %a, <8 x i32> %b, ptr %p) { ; CHECK-LABEL: @exttrunc( -; CHECK-NEXT: [[AB:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[AT:%.*]] = shufflevector <8 x i32> [[A]], 
<8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[BB:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[BT:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[AB1:%.*]] = zext <4 x i32> [[AB]] to <4 x i64> -; CHECK-NEXT: [[AT1:%.*]] = zext <4 x i32> [[AT]] to <4 x i64> -; CHECK-NEXT: [[BB1:%.*]] = sext <4 x i32> [[BB]] to <4 x i64> -; CHECK-NEXT: [[BT1:%.*]] = sext <4 x i32> [[BT]] to <4 x i64> -; CHECK-NEXT: [[ABB:%.*]] = add <4 x i64> [[AB1]], [[BB1]] -; CHECK-NEXT: [[ABT:%.*]] = add <4 x i64> [[AT1]], [[BT1]] -; CHECK-NEXT: [[ABB1:%.*]] = trunc <4 x i64> [[ABB]] to <4 x i32> -; CHECK-NEXT: [[ABT1:%.*]] = trunc <4 x i64> [[ABT]] to <4 x i32> -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[ABB1]], <4 x i32> [[ABT1]], <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i32> [[B:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[R:%.*]] = trunc <8 x i64> [[TMP3]] to <8 x i32> ; CHECK-NEXT: store <8 x i32> [[R]], ptr [[P:%.*]], align 32 ; CHECK-NEXT: ret void ; @@ -605,17 +596,9 @@ define void @exttrunc(<8 x i32> %a, <8 x i32> %b, ptr %p) { define void @zext(<8 x i16> %a, <8 x i16> %b, ptr %p) { ; CHECK-LABEL: @zext( -; CHECK-NEXT: [[AB:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[AT:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[BB:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[BT:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[AB1:%.*]] = zext <4 x i16> [[AB]] to <4 x i32> -; CHECK-NEXT: [[AT1:%.*]] = zext <4 x i16> [[AT]] to <4 x i32> -; CHECK-NEXT: [[BB1:%.*]] = zext <4 x i16> [[BB]] to <4 x i32> -; CHECK-NEXT: [[BT1:%.*]] = zext <4 x i16> [[BT]] to <4 x i32> -; CHECK-NEXT: [[ABB:%.*]] = add <4 x i32> [[AB1]], [[BB1]] -; 
CHECK-NEXT: [[ABT:%.*]] = add <4 x i32> [[AT1]], [[BT1]] -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[ABB]], <4 x i32> [[ABT]], <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[A:%.*]] to <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[B:%.*]] to <8 x i32> +; CHECK-NEXT: [[R:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: store <8 x i32> [[R]], ptr [[P:%.*]], align 32 ; CHECK-NEXT: ret void ; @@ -636,17 +619,9 @@ define void @zext(<8 x i16> %a, <8 x i16> %b, ptr %p) { define void @sext(<8 x i16> %a, <8 x i16> %b, ptr %p) { ; CHECK-LABEL: @sext( -; CHECK-NEXT: [[AB:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[AT:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[BB:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[BT:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[AB1:%.*]] = sext <4 x i16> [[AB]] to <4 x i32> -; CHECK-NEXT: [[AT1:%.*]] = sext <4 x i16> [[AT]] to <4 x i32> -; CHECK-NEXT: [[BB1:%.*]] = sext <4 x i16> [[BB]] to <4 x i32> -; CHECK-NEXT: [[BT1:%.*]] = sext <4 x i16> [[BT]] to <4 x i32> -; CHECK-NEXT: [[ABB:%.*]] = add <4 x i32> [[AB1]], [[BB1]] -; CHECK-NEXT: [[ABT:%.*]] = add <4 x i32> [[AT1]], [[BT1]] -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[ABB]], <4 x i32> [[ABT]], <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i16> [[B:%.*]] to <8 x i32> +; CHECK-NEXT: [[R:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: store <8 x i32> [[R]], ptr [[P:%.*]], align 32 ; CHECK-NEXT: ret void ; @@ -705,11 +680,7 @@ define void @zext_types(<8 x i16> %a, <8 x i32> %b, ptr %p) { define void @trunc(<8 x i64> %a, <8 x i64> %b, ptr %p) { ; CHECK-LABEL: @trunc( -; CHECK-NEXT: [[AB:%.*]] = shufflevector <8 x i64> [[A:%.*]], <8 x i64> poison, <4 x i32> -; CHECK-NEXT: [[AT:%.*]] = shufflevector <8 x i64> [[A]], 
<8 x i64> poison, <4 x i32> -; CHECK-NEXT: [[ABB1:%.*]] = trunc <4 x i64> [[AB]] to <4 x i32> -; CHECK-NEXT: [[ABT1:%.*]] = trunc <4 x i64> [[AT]] to <4 x i32> -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[ABB1]], <4 x i32> [[ABT1]], <8 x i32> +; CHECK-NEXT: [[R:%.*]] = trunc <8 x i64> [[A:%.*]] to <8 x i32> ; CHECK-NEXT: store <8 x i32> [[R]], ptr [[P:%.*]], align 32 ; CHECK-NEXT: ret void ; From 850f30c3ba378321538233b3cfbd93ae2efef77f Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 29 May 2024 09:08:32 +0100 Subject: [PATCH 075/230] [ARM][MVE] Don't allow tail-predication with else predicates The test case contains a vpt block with an else predicated instruction. This might not be very realistic, but currently crashes due to not being able to handle the else. The instruction would need to be removed. This patch adds some extra checks that none of the instructions in vpt block is else predicated, leaving it using vctp. --- llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp | 11 +- .../CodeGen/Thumb2/mve-tailpred-vptblock.ll | 197 ++++++++++++++++++ 2 files changed, 206 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/Thumb2/mve-tailpred-vptblock.ll diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index a3144109b72040..a46c383115e2d6 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -251,6 +251,9 @@ namespace { SetVector &Predicates = PredicatedInsts[MI]; if (Exclusive && Predicates.size() != 1) return false; + // We do not know how to convert an else predicate of a VCTP. + if (getVPTInstrPredicate(*MI) == ARMVCC::Else) + return false; return llvm::any_of(Predicates, isVCTP); } @@ -305,8 +308,12 @@ namespace { // isn't predicated on entry, check whether the vctp is within the block // and that all other instructions are then predicated on it. 
for (auto &Block : Blocks) { - if (isEntryPredicatedOnVCTP(Block, false) || - hasImplicitlyValidVPT(Block, RDA)) + if (isEntryPredicatedOnVCTP(Block, false) && + !any_of(drop_begin(Block.getInsts()), [](const MachineInstr *MI) { + return getVPTInstrPredicate(*MI) == ARMVCC::Else; + })) + continue; + if (hasImplicitlyValidVPT(Block, RDA)) continue; SmallVectorImpl &Insts = Block.getInsts(); diff --git a/llvm/test/CodeGen/Thumb2/mve-tailpred-vptblock.ll b/llvm/test/CodeGen/Thumb2/mve-tailpred-vptblock.ll new file mode 100644 index 00000000000000..f9b3757bb6d2ce --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-tailpred-vptblock.ll @@ -0,0 +1,197 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s + +; This loop has a vpt block that should not block tailpredication +define void @convert_vptblock(ptr %pchTarget, i16 signext %iTargetStride, ptr %pwLineMask, ptr %ptCopySize, i8 zeroext %chColour, i8 zeroext %chOpacity) { +; CHECK-LABEL: convert_vptblock: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsh.w r12, [r3, #2] +; CHECK-NEXT: cmp.w r12, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB0_1: @ %for.body.lr.ph +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: ldrsh.w r10, [r3] +; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: ldrd r4, r5, [sp, #88] +; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: cmp.w r10, #8 +; CHECK-NEXT: mov.w r0, #1 +; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: mov.w r11, #0 +; CHECK-NEXT: it ge +; CHECK-NEXT: movge r3, #8 +; CHECK-NEXT: vidup.u16 q0, r8, #4 +; CHECK-NEXT: sub.w r3, r10, r3 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: adds r3, #7 +; CHECK-NEXT: vmov.i16 q2, 
#0x100 +; CHECK-NEXT: vmov.i16 q3, #0xff +; CHECK-NEXT: add.w r9, r0, r3, lsr #3 +; CHECK-NEXT: .LBB0_2: @ %for.body +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB0_3 Depth 2 +; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: mov r6, r8 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: dls lr, r9 +; CHECK-NEXT: .LBB0_3: @ %do.body +; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: vctp.16 r3 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.u16 q5, [r2, q4] +; CHECK-NEXT: vmul.i16 q4, q5, r5 +; CHECK-NEXT: vshr.u16 q4, q4, #8 +; CHECK-NEXT: vsub.i16 q5, q2, q4 +; CHECK-NEXT: vpt.i16 eq, q4, q3 +; CHECK-NEXT: vmovt q5, q1 +; CHECK-NEXT: vctp.16 r3 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.u16 q6, [r0] +; CHECK-NEXT: vsub.i16 q4, q2, q5 +; CHECK-NEXT: subs r3, #8 +; CHECK-NEXT: vmul.i16 q5, q5, q6 +; CHECK-NEXT: vmla.i16 q5, q4, r4 +; CHECK-NEXT: vshr.u16 q4, q5, #8 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrbt.16 q4, [r0], #8 +; CHECK-NEXT: vidup.u16 q4, r6, #4 +; CHECK-NEXT: le lr, .LBB0_3 +; CHECK-NEXT: @ %bb.4: @ %do.end +; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: add.w r0, r11, #1 +; CHECK-NEXT: add r7, r1 +; CHECK-NEXT: sxth.w r11, r0 +; CHECK-NEXT: cmp r11, r12 +; CHECK-NEXT: blt .LBB0_2 +; CHECK-NEXT: @ %bb.5: +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: bx lr +entry: + %iHeight1 = getelementptr inbounds i8, ptr %ptCopySize, i32 2 + %0 = load i16, ptr %iHeight1, align 2 + %cmp28 = icmp sgt i16 %0, 0 + br i1 %cmp28, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %1 = load i16, ptr %ptCopySize, align 2 + %conv5 = sext i16 %1 to i32 + %2 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.v8i16(i32 0, i32 4) + %conv6 = zext i8 %chOpacity to i16 + %.splatinsert = insertelement <8 x i16> poison, i16 %conv6, i64 0 + %.splat 
= shufflevector <8 x i16> %.splatinsert, <8 x i16> poison, <8 x i32> zeroinitializer + %conv7 = zext i8 %chColour to i16 + %.splatinsert.i = insertelement <8 x i16> poison, i16 %conv7, i64 0 + %.splat.i = shufflevector <8 x i16> %.splatinsert.i, <8 x i16> poison, <8 x i32> zeroinitializer + %conv11 = sext i16 %iTargetStride to i32 + br label %for.body + +for.cond.cleanup: ; preds = %do.end, %entry + ret void + +for.body: ; preds = %for.body.lr.ph, %do.end + %pchTarget.addr.030 = phi ptr [ %pchTarget, %for.body.lr.ph ], [ %add.ptr12, %do.end ] + %y.029 = phi i16 [ 0, %for.body.lr.ph ], [ %inc, %do.end ] + br label %do.body + +do.body: ; preds = %do.body, %for.body + %blkCnt.0 = phi i32 [ %conv5, %for.body ], [ %sub8, %do.body ] + %.pn = phi { <8 x i16>, i32 } [ %2, %for.body ], [ %13, %do.body ] + %pchTargetLine.0 = phi ptr [ %pchTarget.addr.030, %for.body ], [ %add.ptr, %do.body ] + %vStride4Offs.0 = extractvalue { <8 x i16>, i32 } %.pn, 0 + %incr.0 = extractvalue { <8 x i16>, i32 } %.pn, 1 + %3 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %blkCnt.0) + %4 = tail call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0.v8i16.v8i1(ptr %pwLineMask, <8 x i16> %vStride4Offs.0, i32 8, i32 0, i32 1, <8 x i1> %3) + %5 = mul <8 x i16> %4, %.splat + %shr = lshr <8 x i16> %5, + %6 = icmp eq <8 x i16> %shr, + %7 = sub nuw nsw <8 x i16> , %shr + %sub = select <8 x i1> %6, <8 x i16> zeroinitializer, <8 x i16> %7 + %8 = tail call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %pchTargetLine.0, i32 1, <8 x i1> %3, <8 x i8> zeroinitializer) + %9 = zext <8 x i8> %8 to <8 x i16> + %sub.i = sub nsw <8 x i16> , %sub + %10 = mul <8 x i16> %sub.i, %.splat.i + %11 = mul <8 x i16> %sub, %9 + %add.i = add <8 x i16> %10, %11 + %shr.i = lshr <8 x i16> %add.i, + %12 = trunc nuw <8 x i16> %shr.i to <8 x i8> + tail call void @llvm.masked.store.v8i8.p0(<8 x i8> %12, ptr %pchTargetLine.0, i32 1, <8 x i1> %3) + %13 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.v8i16(i32 %incr.0, i32 4) + 
%add.ptr = getelementptr inbounds i8, ptr %pchTargetLine.0, i32 8 + %sub8 = add nsw i32 %blkCnt.0, -8 + %cmp9 = icmp sgt i32 %blkCnt.0, 8 + br i1 %cmp9, label %do.body, label %do.end + +do.end: ; preds = %do.body + %add.ptr12 = getelementptr inbounds i8, ptr %pchTarget.addr.030, i32 %conv11 + %inc = add nuw nsw i16 %y.029, 1 + %cmp = icmp slt i16 %inc, %0 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; This loop has an else predicate on the vqshl, which is not very realistic but +; prevents us from converting to a vptblock without being able to remove it. +define i32 @else(ptr %s1, ptr %s2, i32 %x, ptr %d, i32 %n) { +; CHECK-LABEL: else: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldr r2, [sp, #8] +; CHECK-NEXT: cmp r2, #4 +; CHECK-NEXT: mov r3, r2 +; CHECK-NEXT: it ge +; CHECK-NEXT: movge r3, #4 +; CHECK-NEXT: subs r3, r2, r3 +; CHECK-NEXT: add.w r12, r3, #3 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: add.w r12, r3, r12, lsr #2 +; CHECK-NEXT: movs r3, #98 +; CHECK-NEXT: dls lr, r12 +; CHECK-NEXT: .LBB1_1: @ %do.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vctp.32 r2 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrwt.u32 q1, [r1], #16 +; CHECK-NEXT: vldrwt.u32 q0, [r0] +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vpstet +; CHECK-NEXT: vqdmlsdht.s32 q2, q1, q0 +; CHECK-NEXT: vqshle.u32 q2, r3 +; CHECK-NEXT: vstrwt.32 q2, [r0], #16 +; CHECK-NEXT: le lr, .LBB1_1 +; CHECK-NEXT: @ %bb.2: @ %do.end +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: pop {r7, pc} +entry: + br label %do.body + +do.body: ; preds = %do.body, %entry + %n.addr.0 = phi i32 [ %n, %entry ], [ %sub, %do.body ] + %s2.addr.0 = phi ptr [ %s2, %entry ], [ %add.ptr1, %do.body ] + %s1.addr.0 = phi ptr [ %s1, %entry ], [ %add.ptr, %do.body ] + %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %n.addr.0) + %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %s1.addr.0, i32 4, <4 x i1> %0, <4 x i32> 
zeroinitializer) + %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %s2.addr.0, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) + %3 = tail call <4 x i32> @llvm.arm.mve.vqdmlad.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %2, <4 x i32> %1, i32 0, i32 0, i32 1, <4 x i1> %0) + %4 = xor <4 x i1> %0, + %5 = tail call <4 x i32> @llvm.arm.mve.vshl.scalar.predicated.v4i32.v4i1(<4 x i32> %3, i32 98, i32 1, i32 0, i32 1, <4 x i1> %4) + tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %5, ptr %s1.addr.0, i32 4, <4 x i1> %0) + %add.ptr = getelementptr inbounds i8, ptr %s1.addr.0, i32 16 + %add.ptr1 = getelementptr inbounds i8, ptr %s2.addr.0, i32 16 + %sub = add nsw i32 %n.addr.0, -4 + %cmp = icmp sgt i32 %n.addr.0, 4 + br i1 %cmp, label %do.body, label %do.end + +do.end: ; preds = %do.body + ret i32 0 +} From 5aba0ded6c0415bc267a80469c8ea3661e012dc6 Mon Sep 17 00:00:00 2001 From: jeanPerier Date: Wed, 29 May 2024 10:18:22 +0200 Subject: [PATCH 076/230] [flang] lower assumed-rank variables specification expressions (#93477) Enable lowering of assumed-ranks in specification parts under a debug flag. I am using a debug flag because many cryptic TODOs/issues may be hit until more support is added. The development should not take too long, so I want to stay away from the noise of adding an actual experimental flag to flang-new. 
--- flang/lib/Lower/ConvertVariable.cpp | 33 +++++++-- .../HLFIR/convert-variable-assumed-rank.f90 | 70 +++++++++++++++++++ 2 files changed, 98 insertions(+), 5 deletions(-) create mode 100644 flang/test/Lower/HLFIR/convert-variable-assumed-rank.f90 diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index 075d0634fd1eee..8e9c1d640c3302 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -41,9 +41,15 @@ #include "flang/Optimizer/Support/Utils.h" #include "flang/Semantics/runtime-type-info.h" #include "flang/Semantics/tools.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include +static llvm::cl::opt allowAssumedRank( + "allow-assumed-rank", + llvm::cl::desc("Enable assumed rank lowering - experimental"), + llvm::cl::init(false)); + #define DEBUG_TYPE "flang-lower-variable" /// Helper to lower a scalar expression using a specific symbol mapping. @@ -1885,7 +1891,8 @@ void Fortran::lower::mapSymbolAttributes( return; } - if (Fortran::evaluate::IsAssumedRank(sym)) + const bool isAssumedRank = Fortran::evaluate::IsAssumedRank(sym); + if (isAssumedRank && !allowAssumedRank) TODO(loc, "assumed-rank variable in procedure implemented in Fortran"); Fortran::lower::BoxAnalyzer ba; @@ -1894,6 +1901,8 @@ void Fortran::lower::mapSymbolAttributes( // First deal with pointers and allocatables, because their handling here // is the same regardless of their rank. if (Fortran::semantics::IsAllocatableOrPointer(sym)) { + if (isAssumedRank) + TODO(loc, "assumed-rank pointer or allocatable"); // Get address of fir.box describing the entity. 
// global mlir::Value boxAlloc = preAlloc; @@ -1942,7 +1951,7 @@ void Fortran::lower::mapSymbolAttributes( if (mlir::Value len = lowerExplicitCharLen(converter, loc, ba, symMap, stmtCtx)) explicitParams.push_back(len); - if (sym.Rank() == 0) { + if (!isAssumedRank && sym.Rank() == 0) { // Do not keep scalar characters as fir.box (even when optional). // Lowering and FIR is not meant to deal with scalar characters as // fir.box outside of calls. @@ -1987,9 +1996,11 @@ void Fortran::lower::mapSymbolAttributes( } } // TODO: derived type length parameters. - lowerExplicitLowerBounds(converter, loc, ba, lbounds, symMap, stmtCtx); - lowerExplicitExtents(converter, loc, ba, lbounds, explicitExtents, symMap, - stmtCtx); + if (!isAssumedRank) { + lowerExplicitLowerBounds(converter, loc, ba, lbounds, symMap, stmtCtx); + lowerExplicitExtents(converter, loc, ba, lbounds, explicitExtents, + symMap, stmtCtx); + } genBoxDeclare(converter, symMap, sym, dummyArg, lbounds, explicitParams, explicitExtents, replace); return; @@ -2021,6 +2032,11 @@ void Fortran::lower::mapSymbolAttributes( if (isUnusedEntryDummy) { assert(!Fortran::semantics::IsAllocatableOrPointer(sym) && "handled above"); + // Need to add support for allocatable assumed-rank to use + // logic below, or to simplify it and add codegen for fir.zero + // !fir.box<> instead. + if (isAssumedRank) + TODO(loc, "assumed rank in ENTRY"); // The box is read right away because lowering code does not expect // a non pointer/allocatable symbol to be mapped to a MutableBox. mlir::Type ty = converter.genType(var); @@ -2042,6 +2058,13 @@ void Fortran::lower::mapSymbolAttributes( return false; }; + if (isAssumedRank) { + assert(isUnusedEntryDummy && "assumed rank must be pointers/allocatables " + "or descriptor dummy arguments"); + genUnusedEntryPointBox(); + return; + } + // Helper to generate scalars for the symbol properties. 
auto genValue = [&](const Fortran::lower::SomeExpr &expr) { return genScalarValue(converter, loc, expr, symMap, stmtCtx); diff --git a/flang/test/Lower/HLFIR/convert-variable-assumed-rank.f90 b/flang/test/Lower/HLFIR/convert-variable-assumed-rank.f90 new file mode 100644 index 00000000000000..748c15be84496c --- /dev/null +++ b/flang/test/Lower/HLFIR/convert-variable-assumed-rank.f90 @@ -0,0 +1,70 @@ +! Test lowering of assumed-rank variables +! RUN: bbc -emit-hlfir %s -allow-assumed-rank -o - | FileCheck %s + +module assumed_rank_tests +interface +subroutine takes_real(x) + real :: x(..) +end subroutine +subroutine takes_char(x) + character(*) :: x(..) +end subroutine +end interface +contains + +subroutine test_intrinsic(x) + real :: x(..) + call takes_real(x) +end subroutine + +subroutine test_character_explicit_len(x, n) + integer(8) :: n + character(n) :: x(..) + call takes_char(x) +end subroutine + +subroutine test_character_assumed_len(x) + character(*) :: x(..) + call takes_char(x) +end subroutine + +subroutine test_with_attrs(x) + real, target, optional :: x(..) + call takes_real(x) +end subroutine +! CHECK-LABEL: func.func @_QMassumed_rank_testsPtest_intrinsic( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.box> {fir.bindc_name = "x"}) { +! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QMassumed_rank_testsFtest_intrinsicEx"} : (!fir.box>, !fir.dscope) -> (!fir.box>, !fir.box>) +! CHECK: fir.call @_QPtakes_real(%[[VAL_2]]#0) fastmath : (!fir.box>) -> () +! CHECK: return +! CHECK: } + +! CHECK-LABEL: func.func @_QMassumed_rank_testsPtest_character_explicit_len( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.box>> {fir.bindc_name = "x"}, +! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref {fir.bindc_name = "n"}) { +! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope +! 
CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QMassumed_rank_testsFtest_character_explicit_lenEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref +! CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64 +! CHECK: %[[VAL_6:.*]] = arith.cmpi sgt, %[[VAL_4]], %[[VAL_5]] : i64 +! CHECK: %[[VAL_7:.*]] = arith.select %[[VAL_6]], %[[VAL_4]], %[[VAL_5]] : i64 +! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_7]] dummy_scope %[[VAL_2]] {uniq_name = "_QMassumed_rank_testsFtest_character_explicit_lenEx"} : (!fir.box>>, i64, !fir.dscope) -> (!fir.box>>, !fir.box>>) +! CHECK: fir.call @_QPtakes_char(%[[VAL_8]]#0) fastmath : (!fir.box>>) -> () +! CHECK: return +! CHECK: } + +! CHECK-LABEL: func.func @_QMassumed_rank_testsPtest_character_assumed_len( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.box>> {fir.bindc_name = "x"}) { +! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QMassumed_rank_testsFtest_character_assumed_lenEx"} : (!fir.box>>, !fir.dscope) -> (!fir.box>>, !fir.box>>) +! CHECK: fir.call @_QPtakes_char(%[[VAL_2]]#0) fastmath : (!fir.box>>) -> () +! CHECK: return +! CHECK: } + +! CHECK-LABEL: func.func @_QMassumed_rank_testsPtest_with_attrs( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.box> {fir.bindc_name = "x", fir.optional, fir.target}) { +! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QMassumed_rank_testsFtest_with_attrsEx"} : (!fir.box>, !fir.dscope) -> (!fir.box>, !fir.box>) +! 
CHECK: fir.call @_QPtakes_real(%[[VAL_2]]#0) fastmath : (!fir.box>) -> () +end module From 326f58d7d68c33cfbb6ad54123ab9b56114de502 Mon Sep 17 00:00:00 2001 From: jeanPerier Date: Wed, 29 May 2024 10:19:07 +0200 Subject: [PATCH 077/230] [flang][HLFIR] lower hlfir.declare of assumed-ranks (#93468) hlfir.declare is in charge of ensuring that the lower bounds of its "hlfir entity" output are the ones of the source program. For non-allocatable/non-pointer assumed-ranks where the input descriptor lower bounds may not be ones, the hlfir.declare needs to be lowered to an hlfir.rebox_assumed_rank to set the lower bounds to ones. --- .../Optimizer/HLFIR/Transforms/ConvertToFIR.cpp | 17 +++++++++++------ flang/test/HLFIR/declare-codegen.fir | 9 +++++++++ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp index b8823bfa59f8f2..b48b993ddc5aff 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp @@ -348,7 +348,17 @@ class DeclareOpConversion : public mlir::OpRewritePattern { // Helper to generate the hlfir fir.box with the local lower bounds and // type parameters. auto genHlfirBox = [&]() -> mlir::Value { - if (!mlir::isa(firBase.getType())) { + if (auto baseBoxType = + mlir::dyn_cast(firBase.getType())) { + // Rebox so that lower bounds are correct. 
+ if (baseBoxType.isAssumedRank()) + return builder.create( + loc, hlfirBaseType, firBase, + fir::LowerBoundModifierAttribute::SetToOnes); + return builder.create(loc, hlfirBaseType, firBase, + declareOp.getShape(), + /*slice=*/mlir::Value{}); + } else { llvm::SmallVector typeParams; auto maybeCharType = mlir::dyn_cast( fir::unwrapSequenceType(fir::unwrapPassByRefType(hlfirBaseType))); @@ -358,11 +368,6 @@ class DeclareOpConversion : public mlir::OpRewritePattern { return builder.create( loc, hlfirBaseType, firBase, declareOp.getShape(), /*slice=*/mlir::Value{}, typeParams); - } else { - // Rebox so that lower bounds are correct. - return builder.create(loc, hlfirBaseType, firBase, - declareOp.getShape(), - /*slice=*/mlir::Value{}); } }; if (!mlir::cast(declareOp.getOperation()) diff --git a/flang/test/HLFIR/declare-codegen.fir b/flang/test/HLFIR/declare-codegen.fir index 9f51d0fbc7afd7..bd0d61a2559dbd 100644 --- a/flang/test/HLFIR/declare-codegen.fir +++ b/flang/test/HLFIR/declare-codegen.fir @@ -210,3 +210,12 @@ func.func @dummy_scope(%arg0: !fir.ref) { // CHECK-SAME: %[[VAL_0:.*]]: !fir.ref) { // CHECK: %[[SCOPE:.*]] = fir.dummy_scope : !fir.dscope // CHECK: %[[VAL_1:.*]] = fir.declare %[[VAL_0]] dummy_scope %[[SCOPE]] {uniq_name = "x"} : (!fir.ref, !fir.dscope) -> !fir.ref + +func.func @assumed_rank_declare(%arg0: !fir.box>) { + %0:2 = hlfir.declare %arg0 {uniq_name = "x"} : (!fir.box>) -> (!fir.box>, !fir.box>) + return +} +// CHECK-LABEL: func.func @assumed_rank_declare( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.box>) { +// CHECK: %[[VAL_1:.*]] = fir.declare %[[VAL_0]] {uniq_name = "x"} : (!fir.box>) -> !fir.box> +// CHECK: %[[VAL_2:.*]] = fir.rebox_assumed_rank %[[VAL_1]] lbs ones : (!fir.box>) -> !fir.box> From 6957c00a8ccd36d990ebeb3b672621ba237bd9d8 Mon Sep 17 00:00:00 2001 From: Alastair Houghton Date: Wed, 29 May 2024 09:27:30 +0100 Subject: [PATCH 078/230] [RuntimeDyld][ELF][AArch64] Fix resolveAArch64ShortBranch. 
(#92245) We don't know the load addresses when this function is called, so it shouldn't be trying to use them to determine whether or not the branch is short. Notably, this will fail in the case where the code is being loaded into a target in such a way that the section offsets differ between the process generating the code and the target process. rdar://127673408 --- .../RuntimeDyld/RuntimeDyldELF.cpp | 30 ++++++++++++------- .../AArch64/ELF_ARM64_xsec_branch.s | 20 +++++++++++++ 2 files changed, 40 insertions(+), 10 deletions(-) create mode 100644 llvm/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_xsec_branch.s diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index eaf8c35142defe..0046220611203c 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -1129,7 +1129,8 @@ uint32_t RuntimeDyldELF::getMatchingLoRelocation(uint32_t RelType, bool RuntimeDyldELF::resolveAArch64ShortBranch( unsigned SectionID, relocation_iterator RelI, const RelocationValueRef &Value) { - uint64_t Address; + uint64_t TargetOffset; + unsigned TargetSectionID; if (Value.SymbolName) { auto Loc = GlobalSymbolTable.find(Value.SymbolName); @@ -1138,23 +1139,32 @@ bool RuntimeDyldELF::resolveAArch64ShortBranch( return false; const auto &SymInfo = Loc->second; - Address = - uint64_t(Sections[SymInfo.getSectionID()].getLoadAddressWithOffset( - SymInfo.getOffset())); + + TargetSectionID = SymInfo.getSectionID(); + TargetOffset = SymInfo.getOffset(); } else { - Address = uint64_t(Sections[Value.SectionID].getLoadAddress()); + TargetSectionID = Value.SectionID; + TargetOffset = 0; } - uint64_t Offset = RelI->getOffset(); - uint64_t SourceAddress = Sections[SectionID].getLoadAddressWithOffset(Offset); + + // We don't actually know the load addresses at this point, so if the + // branch is cross-section, we don't know exactly how far away it 
is. + if (TargetSectionID != SectionID) + return false; + + uint64_t SourceOffset = RelI->getOffset(); // R_AARCH64_CALL26 requires immediate to be in range -2^27 <= imm < 2^27 // If distance between source and target is out of range then we should // create thunk. - if (!isInt<28>(Address + Value.Addend - SourceAddress)) + if (!isInt<28>(TargetOffset + Value.Addend - SourceOffset)) return false; - resolveRelocation(Sections[SectionID], Offset, Address, RelI->getType(), - Value.Addend); + RelocationEntry RE(SectionID, SourceOffset, RelI->getType(), Value.Addend); + if (Value.SymbolName) + addRelocationForSymbol(RE, Value.SymbolName); + else + addRelocationForSection(RE, Value.SectionID); return true; } diff --git a/llvm/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_xsec_branch.s b/llvm/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_xsec_branch.s new file mode 100644 index 00000000000000..fd04f569526b9f --- /dev/null +++ b/llvm/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_xsec_branch.s @@ -0,0 +1,20 @@ +# RUN: llvm-mc -triple=arm64-none-linux-gnu -filetype=obj -o %t %s +# RUN: llvm-rtdyld -triple=arm64-none-linux-gnu -verify -check=%s %t + +.globl _main +.weak _label1 + +.section .text.label1,"ax" +_label1: + nop + +.section .text.main,"ax" +_main: + b _label1 + +# Branch must be to stub in .text.main, *not* back to _label1, because +# in general sections could be loaded at arbitrary addresses in target memory, +# and when initially processing locations and generating stubs we don't know +# the final layout yet, so we can't tell if the branch offset is within range. + +# rtdyld-check: *{4}(_main) = 0x14000001 From 4ad2f415f6e30ceb116466bf81515d3765402a0f Mon Sep 17 00:00:00 2001 From: AnastasiyaChernikova Date: Wed, 29 May 2024 11:28:00 +0300 Subject: [PATCH 079/230] [Exegesis] Changing non-standard CHECK in tests to more compliant way (#93222) Fixed some FileChecks in tests. 
Firstly found in PR89047 (https://github.com/llvm/llvm-project/pull/89047#discussion_r1608909489) --- .../test/tools/llvm-exegesis/AArch64/latency-by-opcode-name.s | 2 +- llvm/test/tools/llvm-exegesis/Mips/latency-GPR64.s | 2 +- llvm/test/tools/llvm-exegesis/Mips/latency-by-opcode-name.s | 2 +- .../test/tools/llvm-exegesis/PowerPC/latency-by-opcode-name.s | 2 +- llvm/test/tools/llvm-exegesis/X86/latency/latency-CMOV32rr.s | 2 +- llvm/test/tools/llvm-exegesis/X86/latency/latency-IN16rr.s | 2 +- llvm/test/tools/llvm-exegesis/X86/latency/latency-SBB8rr.s | 2 +- llvm/test/tools/llvm-exegesis/X86/latency/latency-SQRTSSr.s | 2 +- .../tools/llvm-exegesis/X86/latency/latency-by-opcode-name.s | 2 +- llvm/test/tools/llvm-exegesis/X86/latency/max-configs.test | 4 ++-- llvm/test/tools/llvm-exegesis/X86/lbr/mov-add.s | 2 +- .../X86/uops/uops-CMOV16rm-noreg-serialization.s | 2 +- 12 files changed, 13 insertions(+), 13 deletions(-) diff --git a/llvm/test/tools/llvm-exegesis/AArch64/latency-by-opcode-name.s b/llvm/test/tools/llvm-exegesis/AArch64/latency-by-opcode-name.s index 653f544e36ce26..1db28a84e2ff62 100644 --- a/llvm/test/tools/llvm-exegesis/AArch64/latency-by-opcode-name.s +++ b/llvm/test/tools/llvm-exegesis/AArch64/latency-by-opcode-name.s @@ -10,4 +10,4 @@ CHECK-NEXT: config: '' CHECK-NEXT: register_initial_values: CHECK-DAG: - '[[REG2]]=0x0' # We don't check REG3 because in the case that REG2=REG3 the check would fail -CHECK-LAST: ... +CHECK-DAG: ... diff --git a/llvm/test/tools/llvm-exegesis/Mips/latency-GPR64.s b/llvm/test/tools/llvm-exegesis/Mips/latency-GPR64.s index f9b4860c3f4a09..cc2cf20ce05f46 100644 --- a/llvm/test/tools/llvm-exegesis/Mips/latency-GPR64.s +++ b/llvm/test/tools/llvm-exegesis/Mips/latency-GPR64.s @@ -9,4 +9,4 @@ CHECK-NEXT: AND64 CHECK-NEXT: config: '' CHECK-NEXT: register_initial_values: CHECK-DAG: - '[[REG1:[A-Z0-9]+_64]]=0x0' -CHECK-LAST: ... +CHECK-DAG: ... 
diff --git a/llvm/test/tools/llvm-exegesis/Mips/latency-by-opcode-name.s b/llvm/test/tools/llvm-exegesis/Mips/latency-by-opcode-name.s index f3853eaa62ea7d..dcbbd3cf7fc355 100644 --- a/llvm/test/tools/llvm-exegesis/Mips/latency-by-opcode-name.s +++ b/llvm/test/tools/llvm-exegesis/Mips/latency-by-opcode-name.s @@ -9,4 +9,4 @@ CHECK-NEXT: ADD CHECK-NEXT: config: '' CHECK-NEXT: register_initial_values: CHECK-DAG: - '[[REG1:[A-Z0-9]+]]=0x0' -CHECK-LAST: ... +CHECK-DAG: ... diff --git a/llvm/test/tools/llvm-exegesis/PowerPC/latency-by-opcode-name.s b/llvm/test/tools/llvm-exegesis/PowerPC/latency-by-opcode-name.s index 3d457aeb59276a..c4d9fcf2e0613a 100644 --- a/llvm/test/tools/llvm-exegesis/PowerPC/latency-by-opcode-name.s +++ b/llvm/test/tools/llvm-exegesis/PowerPC/latency-by-opcode-name.s @@ -8,4 +8,4 @@ CHECK-NEXT: ADD8 CHECK-NEXT: config: '' CHECK-NEXT: register_initial_values: CHECK-DAG: - '[[REG1:[A-Z0-9]+]]=0x0' -CHECK-LAST: ... +CHECK-DAG: ... diff --git a/llvm/test/tools/llvm-exegesis/X86/latency/latency-CMOV32rr.s b/llvm/test/tools/llvm-exegesis/X86/latency/latency-CMOV32rr.s index 9cdd9bf029d023..384f9f1d8cf9e8 100644 --- a/llvm/test/tools/llvm-exegesis/X86/latency/latency-CMOV32rr.s +++ b/llvm/test/tools/llvm-exegesis/X86/latency/latency-CMOV32rr.s @@ -8,4 +8,4 @@ CHECK-NEXT: key: CHECK-NEXT: instructions: CHECK-NEXT: 'CMOV32rr {{.*}} i_0x{{[0-9a-f]}}' CHECK-NEXT: config: '' -CHECK-LAST: ... +CHECK-DAG: ... diff --git a/llvm/test/tools/llvm-exegesis/X86/latency/latency-IN16rr.s b/llvm/test/tools/llvm-exegesis/X86/latency/latency-IN16rr.s index 8b4f42dd320153..c82f5c884b9928 100644 --- a/llvm/test/tools/llvm-exegesis/X86/latency/latency-IN16rr.s +++ b/llvm/test/tools/llvm-exegesis/X86/latency/latency-IN16rr.s @@ -12,4 +12,4 @@ CHECK-NEXT: - {{.*}} CHECK-NEXT: config: '' CHECK-NEXT: register_initial_values: CHECK-DAG: - '[[REG1:[A-Z0-9]+]]=0x0' -CHECK-LAST: ... +CHECK-DAG: ... 
diff --git a/llvm/test/tools/llvm-exegesis/X86/latency/latency-SBB8rr.s b/llvm/test/tools/llvm-exegesis/X86/latency/latency-SBB8rr.s index c20e687cf20d21..26c4391bc99d6b 100644 --- a/llvm/test/tools/llvm-exegesis/X86/latency/latency-SBB8rr.s +++ b/llvm/test/tools/llvm-exegesis/X86/latency/latency-SBB8rr.s @@ -9,4 +9,4 @@ CHECK-NEXT: SBB8rr CHECK-NEXT: config: '' CHECK-NEXT: register_initial_values: CHECK-DAG: - '[[REG1:[A-Z0-9]+]]=0x0' -CHECK-LAST: ... +CHECK-DAG: ... diff --git a/llvm/test/tools/llvm-exegesis/X86/latency/latency-SQRTSSr.s b/llvm/test/tools/llvm-exegesis/X86/latency/latency-SQRTSSr.s index 7e67a4343f4e68..bf97a40c4bf0da 100644 --- a/llvm/test/tools/llvm-exegesis/X86/latency/latency-SQRTSSr.s +++ b/llvm/test/tools/llvm-exegesis/X86/latency/latency-SQRTSSr.s @@ -10,4 +10,4 @@ CHECK-NEXT: SQRTSSr CHECK-NEXT: config: '' CHECK-NEXT: register_initial_values: CHECK-NOT: crashed -CHECK-LAST: ... +CHECK-DAG: ... diff --git a/llvm/test/tools/llvm-exegesis/X86/latency/latency-by-opcode-name.s b/llvm/test/tools/llvm-exegesis/X86/latency/latency-by-opcode-name.s index 4fee6fe927097a..08beccfe7704f4 100644 --- a/llvm/test/tools/llvm-exegesis/X86/latency/latency-by-opcode-name.s +++ b/llvm/test/tools/llvm-exegesis/X86/latency/latency-by-opcode-name.s @@ -9,4 +9,4 @@ CHECK-NEXT: ADD32rr CHECK-NEXT: config: '' CHECK-NEXT: register_initial_values: CHECK-DAG: - '[[REG1:[A-Z0-9]+]]=0x0' -CHECK-LAST: ... +CHECK-DAG: ... diff --git a/llvm/test/tools/llvm-exegesis/X86/latency/max-configs.test b/llvm/test/tools/llvm-exegesis/X86/latency/max-configs.test index 382e742144ac45..f27101d8966080 100644 --- a/llvm/test/tools/llvm-exegesis/X86/latency/max-configs.test +++ b/llvm/test/tools/llvm-exegesis/X86/latency/max-configs.test @@ -9,7 +9,7 @@ CHECK-NEXT: SBB8rr CHECK-NEXT: config: '' CHECK-NEXT: register_initial_values: CHECK-DAG: - '[[REG1:[A-Z0-9]+]]=0x0' -CHECK-LAST: ... +CHECK-DAG: ... 
CHECK1-NOT: SBB8rr @@ -21,4 +21,4 @@ CHECK2-NEXT: SBB8rr CHECK2-NEXT: config: '' CHECK2-NEXT: register_initial_values: CHECK2-DAG: - '[[REG1:[A-Z0-9]+]]=0x0' -CHECK2-LAST: ... +CHECK2-DAG: ... diff --git a/llvm/test/tools/llvm-exegesis/X86/lbr/mov-add.s b/llvm/test/tools/llvm-exegesis/X86/lbr/mov-add.s index af1662d93a7440..2a8cc8e34450ad 100644 --- a/llvm/test/tools/llvm-exegesis/X86/lbr/mov-add.s +++ b/llvm/test/tools/llvm-exegesis/X86/lbr/mov-add.s @@ -16,4 +16,4 @@ CHECK-NEXT: {{.*}} CHECK-NEXT: num_repetitions: 10000 CHECK-NEXT: measurements: CHECK-NEXT: {{.*}} value: 0.0001, per_snippet_value: 0.0002 {{.*}} -CHECK-LAST: ... +CHECK-DAG: ... diff --git a/llvm/test/tools/llvm-exegesis/X86/uops/uops-CMOV16rm-noreg-serialization.s b/llvm/test/tools/llvm-exegesis/X86/uops/uops-CMOV16rm-noreg-serialization.s index 302c2b0ee722b0..1e673e806da212 100644 --- a/llvm/test/tools/llvm-exegesis/X86/uops/uops-CMOV16rm-noreg-serialization.s +++ b/llvm/test/tools/llvm-exegesis/X86/uops/uops-CMOV16rm-noreg-serialization.s @@ -8,4 +8,4 @@ CHECK-YAML-NEXT: mode: uops CHECK-YAML-NEXT: key: CHECK-YAML-NEXT: instructions: CHECK-YAML-NEXT: - 'CMOV16rm {{[A-Z0-9]+}} {{[A-Z0-9]+}} {{[A-Z0-9]+}} i_0x1 %noreg i_0x0 %noreg i_0x{{[0-9a-f]}}' -CHECK-YAML-LAST: ... +CHECK-YAML-DAG: ... From 93d8d74ae6717c8e7c8b25ad5a6cfa212d3a4d37 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 29 May 2024 09:36:53 +0100 Subject: [PATCH 080/230] [VectorCombine] Remove requirement for Instructions in shuffleToIdentity (#93543) This removes the check that both operands of the original shuffle are instructions, which is a relic from a previous version that held more variables as Instructions. 
--- .../Transforms/Vectorize/VectorCombine.cpp | 3 +- .../AArch64/shuffletoidentity.ll | 29 +++++++------------ 2 files changed, 11 insertions(+), 21 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index c3c4ee8479766e..7ecfe5218ef67c 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1760,8 +1760,7 @@ static Value *generateNewInstTree(ArrayRef Item, FixedVectorType *Ty, // do so. bool VectorCombine::foldShuffleToIdentity(Instruction &I) { auto *Ty = dyn_cast(I.getType()); - if (!Ty || !isa(I.getOperand(0)) || - !isa(I.getOperand(1))) + if (!Ty) return false; SmallVector Start(Ty->getNumElements()); diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll index 62fb0e6c7c11d9..c2e9be56889678 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll @@ -15,9 +15,7 @@ define <8 x i8> @trivial(<8 x i8> %a) { define <4 x i32> @add_same_operands(<4 x i32> %x) { ; CHECK-LABEL: @add_same_operands( -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[SHUF]], [[SHUF]] -; CHECK-NEXT: [[REVSHUF:%.*]] = shufflevector <4 x i32> [[ADD]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[REVSHUF:%.*]] = add <4 x i32> [[X:%.*]], [[X]] ; CHECK-NEXT: ret <4 x i32> [[REVSHUF]] ; %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> @@ -364,8 +362,7 @@ define <8 x i8> @inner_shuffle(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { define <4 x i32> @extrause_add_same_operands(<4 x i32> %x) { ; CHECK-LABEL: @extrause_add_same_operands( ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[SHUF]], [[SHUF]] -; 
CHECK-NEXT: [[REVSHUF:%.*]] = shufflevector <4 x i32> [[ADD]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[REVSHUF:%.*]] = add <4 x i32> [[X]], [[X]] ; CHECK-NEXT: [[ADD2:%.*]] = add <4 x i32> [[SHUF]], [[REVSHUF]] ; CHECK-NEXT: ret <4 x i32> [[ADD2]] ; @@ -513,9 +510,7 @@ define <8 x half> @fma(<8 x half> %a, <8 x half> %b, <8 x half> %c) { define <4 x i64> @single_zext(<4 x i32> %x) { ; CHECK-LABEL: @single_zext( -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i32> [[SHUF]] to <4 x i64> -; CHECK-NEXT: [[REVSHUF:%.*]] = shufflevector <4 x i64> [[ZEXT]], <4 x i64> poison, <4 x i32> +; CHECK-NEXT: [[REVSHUF:%.*]] = zext <4 x i32> [[X:%.*]] to <4 x i64> ; CHECK-NEXT: ret <4 x i64> [[REVSHUF]] ; %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> @@ -695,10 +690,8 @@ define void @trunc(<8 x i64> %a, <8 x i64> %b, ptr %p) { define <4 x i64> @zext_chain(<4 x i16> %x) { ; CHECK-LABEL: @zext_chain( -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i16> [[X:%.*]], <4 x i16> poison, <4 x i32> -; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[SHUF]] to <4 x i32> -; CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i32> [[ZEXT]] to <4 x i64> -; CHECK-NEXT: [[REVSHUF:%.*]] = shufflevector <4 x i64> [[SEXT]], <4 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i16> [[X:%.*]] to <4 x i32> +; CHECK-NEXT: [[REVSHUF:%.*]] = sext <4 x i32> [[TMP1]] to <4 x i64> ; CHECK-NEXT: ret <4 x i64> [[REVSHUF]] ; %shuf = shufflevector <4 x i16> %x, <4 x i16> poison, <4 x i32> @@ -899,13 +892,11 @@ entry: define <4 x i8> @singleop(<4 x i8> %a, <4 x i8> %b) { ; CHECK-LABEL: @singleop( -; CHECK-NEXT: [[A1:%.*]] = shufflevector <4 x i8> [[A:%.*]], <4 x i8> poison, <4 x i32> -; CHECK-NEXT: [[B1:%.*]] = shufflevector <4 x i8> [[B:%.*]], <4 x i8> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[A2:%.*]] = zext <4 x i8> [[A1]] to <4 x i16> -; CHECK-NEXT: [[B2:%.*]] = zext <4 x i8> [[B1]] to <4 x i16> -; 
CHECK-NEXT: [[AB:%.*]] = add <4 x i16> [[A2]], [[B2]] -; CHECK-NEXT: [[T:%.*]] = trunc <4 x i16> [[AB]] to <4 x i8> -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i8> [[T]], <4 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[B:%.*]], <4 x i8> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[A:%.*]] to <4 x i16> +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i16> +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[R:%.*]] = trunc <4 x i16> [[TMP4]] to <4 x i8> ; CHECK-NEXT: ret <4 x i8> [[R]] ; %a1 = shufflevector <4 x i8> %a, <4 x i8> poison, <4 x i32> From fa649df8e54c2aa8921a42ad8d10e1e45700e5d7 Mon Sep 17 00:00:00 2001 From: Daniel Grumberg Date: Wed, 29 May 2024 09:47:23 +0100 Subject: [PATCH 081/230] [clang][ExtractAPI] Flatten all enum cases from anonymous enums at top level (#93559) rdar://128863241 --- .../clang/ExtractAPI/ExtractAPIVisitor.h | 65 +++++----- .../ExtractAPI/anonymous_record_no_typedef.c | 42 ++----- clang/test/ExtractAPI/enum.c | 112 ------------------ clang/tools/libclang/CXExtractAPI.cpp | 3 + 4 files changed, 54 insertions(+), 168 deletions(-) diff --git a/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h b/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h index 8ccebe457ed530..76d7fd798bed3a 100644 --- a/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h +++ b/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h @@ -21,6 +21,7 @@ #include "clang/AST/DeclTemplate.h" #include "clang/AST/ParentMapContext.h" #include "clang/AST/RecursiveASTVisitor.h" +#include "clang/Basic/LLVM.h" #include "clang/Basic/Module.h" #include "clang/Basic/SourceManager.h" #include "clang/Basic/Specifiers.h" @@ -127,7 +128,7 @@ class ExtractAPIVisitorBase : public RecursiveASTVisitor { protected: /// Collect API information for the enum constants and associate with the /// parent enum. 
- void recordEnumConstants(EnumRecord *EnumRecord, + void recordEnumConstants(SymbolReference Container, const EnumDecl::enumerator_range Constants); /// Collect API information for the Objective-C methods and associate with the @@ -248,12 +249,8 @@ class ExtractAPIVisitorBase : public RecursiveASTVisitor { clang::index::generateUSRForDecl(Tag, TagUSR); if (auto *Record = llvm::dyn_cast_if_present( API.findRecordForUSR(TagUSR))) { - if (Record->IsEmbeddedInVarDeclarator) { + if (Record->IsEmbeddedInVarDeclarator) NewRecordContext->stealRecordChain(*Record); - auto *NewRecord = cast(NewRecordContext); - if (NewRecord->Comment.empty()) - NewRecord->Comment = Record->Comment; - } } } }; @@ -394,17 +391,6 @@ bool ExtractAPIVisitorBase::VisitEnumDecl(const EnumDecl *Decl) { if (!getDerivedExtractAPIVisitor().shouldDeclBeIncluded(Decl)) return true; - SmallString<128> QualifiedNameBuffer; - // Collect symbol information. - StringRef Name = Decl->getName(); - if (Name.empty()) - Name = getTypedefName(Decl); - if (Name.empty()) { - llvm::raw_svector_ostream OS(QualifiedNameBuffer); - Decl->printQualifiedName(OS); - Name = QualifiedNameBuffer; - } - SmallString<128> USR; index::generateUSRForDecl(Decl, USR); PresumedLoc Loc = @@ -420,13 +406,29 @@ bool ExtractAPIVisitorBase::VisitEnumDecl(const EnumDecl *Decl) { DeclarationFragmentsBuilder::getFragmentsForEnum(Decl); DeclarationFragments SubHeading = DeclarationFragmentsBuilder::getSubHeading(Decl); - auto *ER = API.createRecord( - USR, Name, createHierarchyInformationForDecl(*Decl), Loc, - AvailabilityInfo::createFromDecl(Decl), Comment, Declaration, SubHeading, - isInSystemHeader(Decl), isEmbeddedInVarDeclarator(*Decl)); + + // Collect symbol information. 
+ SymbolReference ParentContainer; + + if (Decl->hasNameForLinkage()) { + StringRef Name = Decl->getName(); + if (Name.empty()) + Name = getTypedefName(Decl); + + auto *ER = API.createRecord( + USR, Name, createHierarchyInformationForDecl(*Decl), Loc, + AvailabilityInfo::createFromDecl(Decl), Comment, Declaration, + SubHeading, isInSystemHeader(Decl), false); + ParentContainer = SymbolReference(ER); + } else { + // If this an anonymous enum then the parent scope of the constants is the + // top level namespace. + ParentContainer = {}; + } // Now collect information about the enumerators in this enum. - getDerivedExtractAPIVisitor().recordEnumConstants(ER, Decl->enumerators()); + getDerivedExtractAPIVisitor().recordEnumConstants(ParentContainer, + Decl->enumerators()); return true; } @@ -1197,7 +1199,7 @@ bool ExtractAPIVisitorBase::VisitObjCCategoryDecl( /// parent enum. template void ExtractAPIVisitorBase::recordEnumConstants( - EnumRecord *EnumRecord, const EnumDecl::enumerator_range Constants) { + SymbolReference Container, const EnumDecl::enumerator_range Constants) { for (const auto *Constant : Constants) { // Collect symbol information. 
StringRef Name = Constant->getName(); @@ -1218,9 +1220,8 @@ void ExtractAPIVisitorBase::recordEnumConstants( DeclarationFragmentsBuilder::getSubHeading(Constant); API.createRecord( - USR, Name, createHierarchyInformationForDecl(*Constant), Loc, - AvailabilityInfo::createFromDecl(Constant), Comment, Declaration, - SubHeading, isInSystemHeader(Constant)); + USR, Name, Container, Loc, AvailabilityInfo::createFromDecl(Constant), + Comment, Declaration, SubHeading, isInSystemHeader(Constant)); } } @@ -1469,7 +1470,17 @@ class ExtractAPIVisitor bool shouldDeclBeIncluded(const Decl *D) const { return true; } const RawComment *fetchRawCommentForDecl(const Decl *D) const { - return this->Context.getRawCommentForDeclNoCache(D); + if (const auto *Comment = this->Context.getRawCommentForDeclNoCache(D)) + return Comment; + + if (const auto *Declarator = dyn_cast(D)) { + const auto *TagTypeDecl = Declarator->getType()->getAsTagDecl(); + if (TagTypeDecl && TagTypeDecl->isEmbeddedInDeclarator() && + TagTypeDecl->isCompleteDefinition()) + return this->Context.getRawCommentForDeclNoCache(TagTypeDecl); + } + + return nullptr; } }; diff --git a/clang/test/ExtractAPI/anonymous_record_no_typedef.c b/clang/test/ExtractAPI/anonymous_record_no_typedef.c index 71e460afb12833..789316ca8930b8 100644 --- a/clang/test/ExtractAPI/anonymous_record_no_typedef.c +++ b/clang/test/ExtractAPI/anonymous_record_no_typedef.c @@ -84,21 +84,15 @@ struct Vehicle { // TYPE: "text": "The type of vehicle." 
// TYPE: "title": "type" - // BICYCLE: "!testRelLabel": "memberOf $ c:@S@Vehicle@E@anonymous_record_no_typedef.c@{{[0-9]+}}@Bicycle $ c:@S@Vehicle@FI@type" // BICYCLE-LABEL: "!testLabel": "c:@S@Vehicle@E@anonymous_record_no_typedef.c@{{[0-9]+}}@Bicycle" // BICYCLE: "title": "Bicycle" // BICYCLE: "pathComponents": [ - // BICYCLE-NEXT: "Vehicle", - // BICYCLE-NEXT: "type", // BICYCLE-NEXT: "Bicycle" // BICYCLE-NEXT: ] - // CAR: "!testRelLabel": "memberOf $ c:@S@Vehicle@E@anonymous_record_no_typedef.c@{{[0-9]+}}@Car $ c:@S@Vehicle@FI@type" // CAR-LABEL: "!testLabel": "c:@S@Vehicle@E@anonymous_record_no_typedef.c@{{[0-9]+}}@Car" // CAR: "title": "Car" // CAR: "pathComponents": [ - // CAR-NEXT: "Vehicle", - // CAR-NEXT: "type", // CAR-NEXT: "Car" // CAR-NEXT: ] @@ -151,32 +145,22 @@ struct Vehicle { // NAME-NEXT: ] }; -// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix GLOBALENUM +// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix GLOBALCASE +// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix GLOBALOTHERCASE enum { GlobalCase, GlobalOtherCase }; -// GLOBALENUM-DAG: "!testRelLabel": "memberOf $ c:@Ea@GlobalCase@GlobalCase $ c:@Ea@GlobalCase" -// GLOBALENUM-DAG: "!testRelLabel": "memberOf $ c:@Ea@GlobalCase@GlobalOtherCase $ c:@Ea@GlobalCase" -// GLOBALENUM-LABEL: "!testLabel": "c:@Ea@GlobalCase" -// GLOBALENUM: "declarationFragments": [ -// GLOBALENUM-NEXT: { -// GLOBALENUM-NEXT: "kind": "keyword", -// GLOBALENUM-NEXT: "spelling": "enum" -// GLOBALENUM-NEXT: }, -// GLOBALENUM-NEXT: { -// GLOBALENUM-NEXT: "kind": "text", -// GLOBALENUM-NEXT: "spelling": " : " -// GLOBALENUM-NEXT: }, -// GLOBALENUM-NEXT: { -// GLOBALENUM-NEXT: "kind": "typeIdentifier", -// GLOBALENUM-NEXT: "preciseIdentifier": "c:i", -// GLOBALENUM-NEXT: "spelling": "unsigned int" -// GLOBALENUM-NEXT: }, -// GLOBALENUM-NEXT: { -// GLOBALENUM-NEXT: "kind": "text", -// GLOBALENUM-NEXT: "spelling": " { ... 
};" -// GLOBALENUM-NEXT: } -// GLOBALENUM-NEXT: ] +// GLOBALCASE-LABEL: "!testLabel": "c:@Ea@GlobalCase@GlobalCase" +// GLOBALCASE: "title": "GlobalCase" +// GLOBALCASE: "pathComponents": [ +// GLOBALCASE-NEXT: "GlobalCase" +// GLOBALCASE-NEXT: ] + +// GLOBALOTHERCASE-LABEL: "!testLabel": "c:@Ea@GlobalCase@GlobalOtherCase" +// GLOBALOTHERCASE: "title": "GlobalOtherCase" +// GLOBALOTHERCASE: "pathComponents": [ +// GLOBALOTHERCASE-NEXT: "GlobalOtherCase" +// GLOBALOTHERCASE-NEXT: ] // expected-no-diagnostics diff --git a/clang/test/ExtractAPI/enum.c b/clang/test/ExtractAPI/enum.c index 67e003834a7d58..58170aa0e1d906 100644 --- a/clang/test/ExtractAPI/enum.c +++ b/clang/test/ExtractAPI/enum.c @@ -115,18 +115,6 @@ enum { "source": "c:@E@Direction@West", "target": "c:@E@Direction", "targetFallback": "Direction" - }, - { - "kind": "memberOf", - "source": "c:@Ea@Constant@Constant", - "target": "c:@Ea@Constant", - "targetFallback": "enum (unnamed)" - }, - { - "kind": "memberOf", - "source": "c:@Ea@OtherConstant@OtherConstant", - "target": "c:@Ea@OtherConstant", - "targetFallback": "enum (unnamed)" } ], "symbols": [ @@ -677,55 +665,6 @@ enum { "West" ] }, - { - "accessLevel": "public", - "declarationFragments": [ - { - "kind": "keyword", - "spelling": "enum" - }, - { - "kind": "text", - "spelling": " : " - }, - { - "kind": "typeIdentifier", - "preciseIdentifier": "c:i", - "spelling": "unsigned int" - }, - { - "kind": "text", - "spelling": " { ... 
};" - } - ], - "identifier": { - "interfaceLanguage": "c", - "precise": "c:@Ea@Constant" - }, - "kind": { - "displayName": "Enumeration", - "identifier": "c.enum" - }, - "location": { - "position": { - "character": 0, - "line": 16 - }, - "uri": "file://INPUT_DIR/input.h" - }, - "names": { - "navigator": [ - { - "kind": "identifier", - "spelling": "enum (unnamed)" - } - ], - "title": "enum (unnamed)" - }, - "pathComponents": [ - "enum (unnamed)" - ] - }, { "accessLevel": "public", "declarationFragments": [ @@ -765,59 +704,9 @@ enum { "title": "Constant" }, "pathComponents": [ - "enum (unnamed)", "Constant" ] }, - { - "accessLevel": "public", - "declarationFragments": [ - { - "kind": "keyword", - "spelling": "enum" - }, - { - "kind": "text", - "spelling": " : " - }, - { - "kind": "typeIdentifier", - "preciseIdentifier": "c:i", - "spelling": "unsigned int" - }, - { - "kind": "text", - "spelling": " { ... };" - } - ], - "identifier": { - "interfaceLanguage": "c", - "precise": "c:@Ea@OtherConstant" - }, - "kind": { - "displayName": "Enumeration", - "identifier": "c.enum" - }, - "location": { - "position": { - "character": 0, - "line": 20 - }, - "uri": "file://INPUT_DIR/input.h" - }, - "names": { - "navigator": [ - { - "kind": "identifier", - "spelling": "enum (unnamed)" - } - ], - "title": "enum (unnamed)" - }, - "pathComponents": [ - "enum (unnamed)" - ] - }, { "accessLevel": "public", "declarationFragments": [ @@ -857,7 +746,6 @@ enum { "title": "OtherConstant" }, "pathComponents": [ - "enum (unnamed)", "OtherConstant" ] } diff --git a/clang/tools/libclang/CXExtractAPI.cpp b/clang/tools/libclang/CXExtractAPI.cpp index d74f3740406c5c..c35558e66fcb96 100644 --- a/clang/tools/libclang/CXExtractAPI.cpp +++ b/clang/tools/libclang/CXExtractAPI.cpp @@ -45,6 +45,9 @@ struct LibClangExtractAPIVisitor : ExtractAPIVisitor(Context, API) {} const RawComment *fetchRawCommentForDecl(const Decl *D) const { + if (const auto *Comment = Base::fetchRawCommentForDecl(D)) + return Comment; 
+ return Context.getRawCommentForAnyRedecl(D); } From f6ace2bc15bfde4cc9bd140859fa92618568a006 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Wed, 29 May 2024 09:51:05 +0100 Subject: [PATCH 082/230] [AArch64] Expand vector ops when NEON and SVE are unavailable. (#90833) Unlike `+noneon` we must assume that vector types are available, i.e. it is valid to pass/return vector arguments to and from functions. However, the compiler must make sure to scalarize any vector operations. --- .../Target/AArch64/AArch64ISelLowering.cpp | 79 +- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 6 +- ...streaming-mode-fixed-length-and-combine.ll | 226 +- ...treaming-mode-fixed-length-bit-counting.ll | 2167 +++++++- ...sve-streaming-mode-fixed-length-bitcast.ll | 30 +- ...e-streaming-mode-fixed-length-bitselect.ll | 32 +- .../sve-streaming-mode-fixed-length-concat.ll | 119 +- ...e-streaming-mode-fixed-length-ext-loads.ll | 338 +- ...ing-mode-fixed-length-extract-subvector.ll | 50 +- ...ng-mode-fixed-length-extract-vector-elt.ll | 54 +- ...e-streaming-mode-fixed-length-fcopysign.ll | 846 ++- ...ve-streaming-mode-fixed-length-fp-arith.ll | 3177 ++++++++--- ...streaming-mode-fixed-length-fp-compares.ll | 4788 +++++++++-------- ...-streaming-mode-fixed-length-fp-convert.ll | 29 +- ...aming-mode-fixed-length-fp-extend-trunc.ll | 732 ++- .../sve-streaming-mode-fixed-length-fp-fma.ll | 569 +- ...e-streaming-mode-fixed-length-fp-minmax.ll | 2040 ++++--- ...eaming-mode-fixed-length-fp-reduce-fa64.ll | 26 +- ...e-streaming-mode-fixed-length-fp-reduce.ll | 1438 +++-- ...streaming-mode-fixed-length-fp-rounding.ll | 2030 ++++++- ...e-streaming-mode-fixed-length-fp-select.ll | 305 +- ...e-streaming-mode-fixed-length-fp-to-int.ll | 2254 ++++++-- ...-streaming-mode-fixed-length-fp-vselect.ll | 511 +- ...ing-mode-fixed-length-insert-vector-elt.ll | 367 +- ...e-streaming-mode-fixed-length-int-arith.ll | 2132 +++++++- ...treaming-mode-fixed-length-int-compares.ll | 1048 +++- 
...sve-streaming-mode-fixed-length-int-div.ll | 2044 +++---- ...streaming-mode-fixed-length-int-extends.ll | 3716 ++++++++++--- ...eaming-mode-fixed-length-int-immediates.ll | 3425 +++++++++++- ...sve-streaming-mode-fixed-length-int-log.ll | 1503 +++++- ...-streaming-mode-fixed-length-int-minmax.ll | 2404 ++++++++- ...ing-mode-fixed-length-int-mla-neon-fa64.ll | 47 +- ...ve-streaming-mode-fixed-length-int-mulh.ll | 1664 +++++- ...-streaming-mode-fixed-length-int-reduce.ll | 1642 +++++- ...sve-streaming-mode-fixed-length-int-rem.ll | 2654 ++++----- ...-streaming-mode-fixed-length-int-select.ll | 581 +- ...-streaming-mode-fixed-length-int-shifts.ll | 1632 +++++- ...e-streaming-mode-fixed-length-int-to-fp.ll | 1895 +++++-- ...streaming-mode-fixed-length-int-vselect.ll | 817 ++- ...-streaming-mode-fixed-length-ld2-alloca.ll | 118 +- ...reaming-mode-fixed-length-limit-duplane.ll | 145 +- .../sve-streaming-mode-fixed-length-loads.ll | 33 +- ...-streaming-mode-fixed-length-log-reduce.ll | 888 ++- ...streaming-mode-fixed-length-masked-load.ll | 3314 +++++++++--- ...treaming-mode-fixed-length-masked-store.ll | 806 ++- ...eaming-mode-fixed-length-optimize-ptrue.ll | 937 +++- ...streaming-mode-fixed-length-permute-rev.ll | 472 +- ...g-mode-fixed-length-permute-zip-uzp-trn.ll | 1261 ++++- .../sve-streaming-mode-fixed-length-ptest.ll | 399 +- .../sve-streaming-mode-fixed-length-rev.ll | 936 +++- ...e-streaming-mode-fixed-length-sdiv-pow2.ll | 768 ++- ...sve-streaming-mode-fixed-length-shuffle.ll | 72 +- ...treaming-mode-fixed-length-splat-vector.ll | 245 +- .../sve-streaming-mode-fixed-length-stores.ll | 60 +- ...e-streaming-mode-fixed-length-subvector.ll | 8 +- ...treaming-mode-fixed-length-trunc-stores.ll | 64 +- .../sve-streaming-mode-fixed-length-trunc.ll | 2789 +++++++++- ...eaming-mode-fixed-length-vector-shuffle.ll | 339 +- .../sve-streaming-mode-test-register-mov.ll | 6 +- 59 files changed, 49850 insertions(+), 13227 deletions(-) diff --git 
a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 25ba8d8500306f..814bbe27049820 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -360,24 +360,24 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, if (Subtarget->hasNEON()) { addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass); addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass); - // Someone set us up the NEON. - addDRTypeForNEON(MVT::v2f32); - addDRTypeForNEON(MVT::v8i8); - addDRTypeForNEON(MVT::v4i16); - addDRTypeForNEON(MVT::v2i32); - addDRTypeForNEON(MVT::v1i64); - addDRTypeForNEON(MVT::v1f64); - addDRTypeForNEON(MVT::v4f16); - addDRTypeForNEON(MVT::v4bf16); - - addQRTypeForNEON(MVT::v4f32); - addQRTypeForNEON(MVT::v2f64); - addQRTypeForNEON(MVT::v16i8); - addQRTypeForNEON(MVT::v8i16); - addQRTypeForNEON(MVT::v4i32); - addQRTypeForNEON(MVT::v2i64); - addQRTypeForNEON(MVT::v8f16); - addQRTypeForNEON(MVT::v8bf16); + + addDRType(MVT::v2f32); + addDRType(MVT::v8i8); + addDRType(MVT::v4i16); + addDRType(MVT::v2i32); + addDRType(MVT::v1i64); + addDRType(MVT::v1f64); + addDRType(MVT::v4f16); + addDRType(MVT::v4bf16); + + addQRType(MVT::v4f32); + addQRType(MVT::v2f64); + addQRType(MVT::v16i8); + addQRType(MVT::v8i16); + addQRType(MVT::v4i32); + addQRType(MVT::v2i64); + addQRType(MVT::v8f16); + addQRType(MVT::v8bf16); } if (Subtarget->hasSVEorSME()) { @@ -1125,7 +1125,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - if (Subtarget->hasNEON()) { + if (Subtarget->isNeonAvailable()) { // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to // silliness like this: for (auto Op : @@ -1337,6 +1337,24 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // FADDP custom lowering for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 }) 
setOperationAction(ISD::FADD, VT, Custom); + } else /* !isNeonAvailable */ { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { + for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) + setOperationAction(Op, VT, Expand); + + if (VT.is128BitVector() || VT.is64BitVector()) { + setOperationAction(ISD::LOAD, VT, Legal); + setOperationAction(ISD::STORE, VT, Legal); + setOperationAction(ISD::BITCAST, VT, + Subtarget->isLittleEndian() ? Legal : Expand); + } + for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { + setTruncStoreAction(VT, InnerVT, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); + } + } } if (Subtarget->hasSME()) { @@ -2020,14 +2038,16 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { setOperationAction(ISD::ZERO_EXTEND, VT, Default); } -void AArch64TargetLowering::addDRTypeForNEON(MVT VT) { +void AArch64TargetLowering::addDRType(MVT VT) { addRegisterClass(VT, &AArch64::FPR64RegClass); - addTypeForNEON(VT); + if (Subtarget->isNeonAvailable()) + addTypeForNEON(VT); } -void AArch64TargetLowering::addQRTypeForNEON(MVT VT) { +void AArch64TargetLowering::addQRType(MVT VT) { addRegisterClass(VT, &AArch64::FPR128RegClass); - addTypeForNEON(VT); + if (Subtarget->isNeonAvailable()) + addTypeForNEON(VT); } EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, @@ -9445,7 +9465,8 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { - if (!Subtarget->hasNEON()) + if (!Subtarget->isNeonAvailable() && + !Subtarget->useSVEForFixedLengthVectors()) return SDValue(); EVT VT = Op.getValueType(); @@ -14141,6 +14162,13 @@ SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi); } +bool 
AArch64TargetLowering::shouldExpandBuildVectorWithShuffles( + EVT VT, unsigned DefinedValues) const { + if (!Subtarget->isNeonAvailable()) + return false; + return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues); +} + bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef M, EVT VT) const { // Currently no fixed length shuffles that require SVE are legal. if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) @@ -19838,7 +19866,8 @@ performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { // help, for example, to produce ssra from sshr+add. static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); - if (VT != MVT::i64) + if (VT != MVT::i64 || + DAG.getTargetLoweringInfo().isOperationExpand(N->getOpcode(), MVT::v1i64)) return SDValue(); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index a44a3d35d2f9c8..73bc9ad53bb8a3 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -1017,8 +1017,10 @@ class AArch64TargetLowering : public TargetLowering { void addTypeForNEON(MVT VT); void addTypeForFixedLengthSVE(MVT VT); - void addDRTypeForNEON(MVT VT); - void addQRTypeForNEON(MVT VT); + void addDRType(MVT VT); + void addQRType(MVT VT); + + bool shouldExpandBuildVectorWithShuffles(EVT, unsigned) const override; unsigned allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL, SelectionDAG &DAG) const; diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll index ed3222529a3bb9..4cdb175f55c9cc 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll @@ -18,8 +18,15 @@ define <4 x i8> 
@vls_sve_and_4xi8(<4 x i8> %b) nounwind { ; ; NONEON-NOSVE-LABEL: vls_sve_and_4xi8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d1, #0xff000000ff0000 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #12] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %c = and <4 x i8> %b, ret <4 x i8> %c @@ -37,8 +44,21 @@ define <8 x i8> @vls_sve_and_8xi8(<8 x i8> %b) nounwind { ; ; NONEON-NOSVE-LABEL: vls_sve_and_8xi8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d1, #0xff00ff00ff00ff00 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #14] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %c = and <8 x i8> %b, ret <8 x i8> %c @@ -56,8 +76,33 @@ define <16 x i8> @vls_sve_and_16xi8(<16 x i8> %b) nounwind { ; ; NONEON-NOSVE-LABEL: vls_sve_and_16xi8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v1.2d, #0xff00ff00ff00ff00 -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #30] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #28] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #26] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #24] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #22] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #20] +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #18] +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %c = and <16 x i8> %b, ret <16 x i8> %c @@ -78,9 +123,57 @@ define <32 x i8> @vls_sve_and_32xi8(<32 x i8> %ap) nounwind { ; ; NONEON-NOSVE-LABEL: vls_sve_and_32xi8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v2.2d, #0xff00ff00ff00ff00 -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #46] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #62] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #60] +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #58] +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #56] +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #54] +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #52] +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #50] +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; 
NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %b = and <32 x i8> %ap, @@ -102,9 +195,11 @@ define <2 x i16> @vls_sve_and_2xi16(<2 x i16> %b) nounwind { ; ; NONEON-NOSVE-LABEL: vls_sve_and_2xi16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov v0.s[0], wzr -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: stp wzr, w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %c = and <2 x i16> %b, ret <2 x i16> %c @@ -122,8 +217,15 @@ define <4 x i16> @vls_sve_and_4xi16(<4 x i16> %b) nounwind { ; ; NONEON-NOSVE-LABEL: vls_sve_and_4xi16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d1, #0xffff0000ffff0000 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #12] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %c = and <4 x i16> %b, ret <4 x i16> %c @@ -141,8 +243,21 @@ define <8 x i16> @vls_sve_and_8xi16(<8 x i16> %b) nounwind { ; ; NONEON-NOSVE-LABEL: vls_sve_and_8xi16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v1.2d, #0xffff0000ffff0000 -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #28] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #20] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %c = and <8 x i16> %b, ret <8 x i16> %c @@ -163,9 +278,33 @@ define <16 x i16> @vls_sve_and_16xi16(<16 x i16> %b) nounwind { ; ; NONEON-NOSVE-LABEL: vls_sve_and_16xi16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v2.2d, #0xffff0000ffff0000 -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-64]! +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #44] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #60] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #56] +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #52] +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; 
NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %c = and <16 x i16> %b, ret <16 x i16> %c @@ -183,9 +322,11 @@ define <2 x i32> @vls_sve_and_2xi32(<2 x i32> %b) nounwind { ; ; NONEON-NOSVE-LABEL: vls_sve_and_2xi32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov v0.s[0], wzr -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: stp wzr, w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %c = and <2 x i32> %b, ret <2 x i32> %c @@ -203,8 +344,13 @@ define <4 x i32> @vls_sve_and_4xi32(<4 x i32> %b) nounwind { ; ; NONEON-NOSVE-LABEL: vls_sve_and_4xi32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v1.2d, #0xffffffff00000000 -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: stp wzr, w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: stp wzr, w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %c = and <4 x i32> %b, ret <4 x i32> %c @@ -225,9 +371,17 @@ define <8 x i32> @vls_sve_and_8xi32(<8 x i32> %b) nounwind { ; ; NONEON-NOSVE-LABEL: vls_sve_and_8xi32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v2.2d, #0xffffffff00000000 -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: stp wzr, w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: stp wzr, w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: stp wzr, w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: stp wzr, w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %c = and <8 x i32> %b, ret <8 x i32> %c @@ -245,7 +399,11 @@ define <2 x i64> @vls_sve_and_2xi64(<2 x i64> %b) nounwind { ; ; NONEON-NOSVE-LABEL: vls_sve_and_2xi64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov v0.d[0], xzr +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: stp xzr, x8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %c = and <2 x i64> %b, ret <2 x i64> %c @@ -265,8 +423,16 @@ define <4 x i64> @vls_sve_and_4xi64(<4 x i64> %b) nounwind { ; ; NONEON-NOSVE-LABEL: vls_sve_and_4xi64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov v0.d[0], xzr -; NONEON-NOSVE-NEXT: mov v1.d[0], xzr +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #40] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: stp xzr, x8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: stp xzr, x8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %c = and <4 x i64> %b, ret <4 x i64> %c diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll index cd6c2b489efe4c..f920efeb4892d1 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll +++ 
b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll @@ -22,12 +22,26 @@ define <4 x i8> @ctlz_v4i8(<4 x i8> %op) { ; ; NONEON-NOSVE-LABEL: ctlz_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d1, #0xff00ff00ff00ff -; NONEON-NOSVE-NEXT: mov w8, #8 // =0x8 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: dup v1.4h, w8 -; NONEON-NOSVE-NEXT: clz v0.4h, v0.4h -; NONEON-NOSVE-NEXT: sub v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #2] +; NONEON-NOSVE-NEXT: ldrb w11, [sp] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: clz w9, w9 +; NONEON-NOSVE-NEXT: clz w10, w10 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: sub w9, w9, #24 +; NONEON-NOSVE-NEXT: sub w10, w10, #24 +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: clz w8, w11 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w10, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i8> @llvm.ctlz.v4i8(<4 x i8> %op) ret <4 x i8> %res @@ -44,7 +58,42 @@ define <8 x i8> @ctlz_v8i8(<8 x i8> %op) { ; ; NONEON-NOSVE-LABEL: ctlz_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: clz v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %op) ret <8 x i8> %res @@ -61,7 +110,74 @@ define <16 x i8> @ctlz_v16i8(<16 x i8> %op) { ; ; NONEON-NOSVE-LABEL: ctlz_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: clz v0.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; 
NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %op) ret <16 x i8> %res @@ -79,10 +195,140 @@ define void @ctlz_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: ctlz_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: clz v0.16b, v0.16b -; NONEON-NOSVE-NEXT: clz v1.16b, v1.16b +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; 
NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; 
NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp 
q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %op) @@ -103,12 +349,17 @@ define <2 x i16> @ctlz_v2i16(<2 x i16> %op) { ; ; NONEON-NOSVE-LABEL: ctlz_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d1, #0x00ffff0000ffff -; NONEON-NOSVE-NEXT: mov w8, #16 // =0x10 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: dup v1.2s, w8 -; NONEON-NOSVE-NEXT: clz v0.2s, v0.2s -; NONEON-NOSVE-NEXT: sub v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: clz w9, w9 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: sub w9, w9, #16 +; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i16> @llvm.ctlz.v2i16(<2 x i16> %op) ret <2 x i16> %res @@ -125,7 +376,26 @@ define <4 x i16> @ctlz_v4i16(<4 x i16> %op) { ; ; NONEON-NOSVE-LABEL: ctlz_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: clz v0.4h, v0.4h +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %op) ret <4 x i16> %res @@ -142,7 +412,42 @@ define <8 x i16> @ctlz_v8i16(<8 x i16> %op) { ; ; NONEON-NOSVE-LABEL: ctlz_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: clz v0.8h, v0.8h +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %op) ret <8 x i16> %res @@ -160,10 +465,76 @@ define void @ctlz_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: ctlz_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: clz v0.8h, v0.8h -; NONEON-NOSVE-NEXT: clz v1.8h, v1.8h +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; 
NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %op) @@ -182,7 +553,15 @@ define <2 x i32> @ctlz_v2i32(<2 x i32> %op) { ; ; NONEON-NOSVE-LABEL: ctlz_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: clz v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: clz w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %op) ret <2 x i32> %res @@ -199,7 +578,20 @@ define <4 x i32> @ctlz_v4i32(<4 x i32> %op) { ; ; NONEON-NOSVE-LABEL: ctlz_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: clz v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: clz w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: clz w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %op) ret <4 x i32> %res @@ -217,10 +609,32 @@ define void @ctlz_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: ctlz_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: clz v0.4s, v0.4s -; NONEON-NOSVE-NEXT: clz v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: clz w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: clz w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: clz w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: clz w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %op) @@ -239,23 +653,13 @@ define <1 x i64> @ctlz_v1i64(<1 x 
i64> %op) { ; ; NONEON-NOSVE-LABEL: ctlz_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushr d1, d0, #1 -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: ushr d1, d0, #2 -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: ushr d1, d0, #4 -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: ushr d1, d0, #8 -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: ushr d1, d0, #16 -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: ushr d1, d0, #32 -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: mvn v0.8b, v0.8b -; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b -; NONEON-NOSVE-NEXT: uaddlp v0.4h, v0.8b -; NONEON-NOSVE-NEXT: uaddlp v0.2s, v0.4h -; NONEON-NOSVE-NEXT: uaddlp v0.1d, v0.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: clz x8, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.ctlz.v1i64(<1 x i64> %op) ret <1 x i64> %res @@ -272,23 +676,15 @@ define <2 x i64> @ctlz_v2i64(<2 x i64> %op) { ; ; NONEON-NOSVE-LABEL: ctlz_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushr v1.2d, v0.2d, #1 -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: ushr v1.2d, v0.2d, #2 -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: ushr v1.2d, v0.2d, #4 -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: ushr v1.2d, v0.2d, #8 -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: ushr v1.2d, v0.2d, #16 -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: ushr v1.2d, v0.2d, #32 -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: mvn v0.16b, v0.16b -; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b -; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b -; NONEON-NOSVE-NEXT: uaddlp v0.4s, 
v0.8h -; NONEON-NOSVE-NEXT: uaddlp v0.2d, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: clz x9, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: clz x8, x8 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %op) ret <2 x i64> %res @@ -306,42 +702,22 @@ define void @ctlz_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: ctlz_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: ushr v2.2d, v0.2d, #1 -; NONEON-NOSVE-NEXT: ushr v3.2d, v1.2d, #1 -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v3.16b -; NONEON-NOSVE-NEXT: ushr v2.2d, v0.2d, #2 -; NONEON-NOSVE-NEXT: ushr v3.2d, v1.2d, #2 -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v3.16b -; NONEON-NOSVE-NEXT: ushr v2.2d, v0.2d, #4 -; NONEON-NOSVE-NEXT: ushr v3.2d, v1.2d, #4 -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v3.16b -; NONEON-NOSVE-NEXT: ushr v2.2d, v0.2d, #8 -; NONEON-NOSVE-NEXT: ushr v3.2d, v1.2d, #8 -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v3.16b -; NONEON-NOSVE-NEXT: ushr v2.2d, v0.2d, #16 -; NONEON-NOSVE-NEXT: ushr v3.2d, v1.2d, #16 -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v3.16b -; NONEON-NOSVE-NEXT: ushr v2.2d, v0.2d, #32 -; NONEON-NOSVE-NEXT: ushr v3.2d, v1.2d, #32 -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v3.16b -; NONEON-NOSVE-NEXT: mvn v0.16b, v0.16b -; NONEON-NOSVE-NEXT: mvn v1.16b, v1.16b -; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b -; NONEON-NOSVE-NEXT: cnt v1.16b, v1.16b -; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b -; 
NONEON-NOSVE-NEXT: uaddlp v1.8h, v1.16b -; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h -; NONEON-NOSVE-NEXT: uaddlp v1.4s, v1.8h -; NONEON-NOSVE-NEXT: uaddlp v0.2d, v0.4s -; NONEON-NOSVE-NEXT: uaddlp v1.2d, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: clz x9, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: clz x8, x8 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: clz x9, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: clz x8, x8 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %op) @@ -365,10 +741,37 @@ define <4 x i8> @ctpop_v4i8(<4 x i8> %op) { ; ; NONEON-NOSVE-LABEL: ctpop_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d1, #0xff00ff00ff00ff -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: str d0, [sp, #64] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #66] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: fmov d1, x9 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #64] +; NONEON-NOSVE-NEXT: fmov d2, x10 +; NONEON-NOSVE-NEXT: fmov d3, x8 ; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b -; NONEON-NOSVE-NEXT: uaddlp v0.4h, v0.8b +; NONEON-NOSVE-NEXT: cnt v1.8b, v1.8b +; NONEON-NOSVE-NEXT: cnt v2.8b, v2.8b +; NONEON-NOSVE-NEXT: cnt v3.8b, v3.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h1, v1.8b +; NONEON-NOSVE-NEXT: uaddlv h2, v2.8b +; NONEON-NOSVE-NEXT: uaddlv h3, v3.8b +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; 
NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: stp q3, q2, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #72] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> %op) ret <4 x i8> %res @@ -385,7 +788,67 @@ define <8 x i8> @ctpop_v8i8(<8 x i8> %op) { ; ; NONEON-NOSVE-LABEL: ctpop_v8i8: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #144 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 144 +; NONEON-NOSVE-NEXT: str d0, [sp, #128] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #135] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #134] ; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #112] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #133] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #96] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #132] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #80] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #131] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #64] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #130] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #48] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #129] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; 
NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #128] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #112] +; NONEON-NOSVE-NEXT: strb w8, [sp, #143] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #96] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: strb w8, [sp, #142] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #80] +; NONEON-NOSVE-NEXT: strb w8, [sp, #141] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #64] +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: strb w8, [sp, #140] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #139] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #138] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #137] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #136] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #136] +; NONEON-NOSVE-NEXT: add sp, sp, #144 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %op) ret <8 x i8> %res @@ -402,7 +865,126 @@ define <16 x i8> @ctpop_v16i8(<16 x i8> %op) { ; ; NONEON-NOSVE-LABEL: ctpop_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #304 +; NONEON-NOSVE-NEXT: str x29, [sp, #288] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 304 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: str q0, [sp, #256] +; NONEON-NOSVE-NEXT: ldr x29, [sp, #288] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #271] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #270] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #240] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #269] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; 
NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #224] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #268] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #208] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #267] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #192] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #266] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #176] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #265] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #160] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #264] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #144] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #263] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #128] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #262] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #112] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #261] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #96] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #260] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #80] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #259] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; 
NONEON-NOSVE-NEXT: str q0, [sp, #64] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #258] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #48] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #257] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #256] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #240] +; NONEON-NOSVE-NEXT: strb w8, [sp, #287] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #224] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: strb w8, [sp, #286] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #208] +; NONEON-NOSVE-NEXT: strb w8, [sp, #285] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #192] +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: strb w8, [sp, #284] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #176] +; NONEON-NOSVE-NEXT: strb w8, [sp, #283] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #160] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #282] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #144] +; NONEON-NOSVE-NEXT: strb w8, [sp, #281] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #128] +; NONEON-NOSVE-NEXT: strb w8, [sp, #280] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #112] +; NONEON-NOSVE-NEXT: strb w8, [sp, #279] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #96] +; NONEON-NOSVE-NEXT: strb w8, [sp, #278] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #80] +; NONEON-NOSVE-NEXT: strb w8, [sp, #277] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #64] +; NONEON-NOSVE-NEXT: strb w8, [sp, #276] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #275] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #274] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, 
#273] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #272] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #272] +; NONEON-NOSVE-NEXT: add sp, sp, #304 ; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %op) ret <16 x i8> %res @@ -420,10 +1002,240 @@ define void @ctpop_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: ctpop_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b -; NONEON-NOSVE-NEXT: cnt v1.16b, v1.16b +; NONEON-NOSVE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: sub sp, sp, #576 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 592 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #512] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #543] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #542] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #240] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #541] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #224] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #540] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #208] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #539] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #192] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #538] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #176] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #537] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #160] +; 
NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #536] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #144] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #535] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #128] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #534] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #112] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #533] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #96] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #532] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #80] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #531] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #64] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #530] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #48] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #529] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #528] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #527] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, 
[sp, #526] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #496] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #525] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #480] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #524] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #464] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #523] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #448] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #522] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #432] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #521] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #416] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #520] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #400] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #519] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #384] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #518] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #368] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #517] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #352] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #516] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b 
+; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #336] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #515] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #320] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #514] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #304] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #513] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #288] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #512] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #272] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #240] +; NONEON-NOSVE-NEXT: strb w8, [sp, #575] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #224] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: strb w8, [sp, #574] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #208] +; NONEON-NOSVE-NEXT: strb w8, [sp, #573] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #192] +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: strb w8, [sp, #572] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #176] +; NONEON-NOSVE-NEXT: strb w8, [sp, #571] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #160] +; NONEON-NOSVE-NEXT: str q0, [sp, #256] +; NONEON-NOSVE-NEXT: strb w8, [sp, #570] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #144] +; NONEON-NOSVE-NEXT: strb w8, [sp, #569] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #128] +; NONEON-NOSVE-NEXT: strb w8, [sp, #568] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #112] +; NONEON-NOSVE-NEXT: strb w8, [sp, #567] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #96] +; NONEON-NOSVE-NEXT: strb w8, [sp, #566] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #80] +; NONEON-NOSVE-NEXT: strb w8, [sp, #565] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #64] +; NONEON-NOSVE-NEXT: strb w8, [sp, 
#564] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #563] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #562] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #561] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #560] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #496] +; NONEON-NOSVE-NEXT: strb w8, [sp, #559] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #480] +; NONEON-NOSVE-NEXT: strb w8, [sp, #558] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #464] +; NONEON-NOSVE-NEXT: strb w8, [sp, #557] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #448] +; NONEON-NOSVE-NEXT: strb w8, [sp, #556] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #432] +; NONEON-NOSVE-NEXT: strb w8, [sp, #555] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #416] +; NONEON-NOSVE-NEXT: strb w8, [sp, #554] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #400] +; NONEON-NOSVE-NEXT: strb w8, [sp, #553] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #384] +; NONEON-NOSVE-NEXT: strb w8, [sp, #552] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #368] +; NONEON-NOSVE-NEXT: strb w8, [sp, #551] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #352] +; NONEON-NOSVE-NEXT: strb w8, [sp, #550] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #336] +; NONEON-NOSVE-NEXT: strb w8, [sp, #549] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #320] +; NONEON-NOSVE-NEXT: strb w8, [sp, #548] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #304] +; NONEON-NOSVE-NEXT: strb w8, [sp, #547] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #288] +; NONEON-NOSVE-NEXT: strb w8, [sp, #546] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #272] +; NONEON-NOSVE-NEXT: strb w8, [sp, #545] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #256] +; NONEON-NOSVE-NEXT: strb w8, [sp, #544] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #544] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #576 +; NONEON-NOSVE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %op) @@ -443,11 +1255,23 @@ define <2 x i16> 
@ctpop_v2i16(<2 x i16> %op) { ; ; NONEON-NOSVE-LABEL: ctpop_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d1, #0x00ffff0000ffff -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #48 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: str d0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: fmov d1, x9 ; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b -; NONEON-NOSVE-NEXT: uaddlp v0.4h, v0.8b -; NONEON-NOSVE-NEXT: uaddlp v0.2s, v0.4h +; NONEON-NOSVE-NEXT: cnt v1.8b, v1.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h1, v1.8b +; NONEON-NOSVE-NEXT: stp q1, q0, [sp] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> %op) ret <2 x i16> %res @@ -464,8 +1288,39 @@ define <4 x i16> @ctpop_v4i16(<4 x i16> %op) { ; ; NONEON-NOSVE-LABEL: ctpop_v4i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: str d0, [sp, #64] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #68] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #48] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #66] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #64] ; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b -; NONEON-NOSVE-NEXT: uaddlp v0.4h, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; 
NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #72] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %op) ret <4 x i16> %res @@ -482,8 +1337,67 @@ define <8 x i16> @ctpop_v8i16(<8 x i16> %op) { ; ; NONEON-NOSVE-LABEL: ctpop_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b -; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #160 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 +; NONEON-NOSVE-NEXT: str q0, [sp, #128] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #142] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #140] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #112] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #138] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #96] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #136] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #80] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #134] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #64] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #132] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #48] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; 
NONEON-NOSVE-NEXT: ldrh w8, [sp, #130] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #128] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #112] +; NONEON-NOSVE-NEXT: strh w8, [sp, #158] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #96] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: strh w8, [sp, #156] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #80] +; NONEON-NOSVE-NEXT: strh w8, [sp, #154] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #64] +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: strh w8, [sp, #152] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #150] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #148] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #146] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #144] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #144] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %op) ret <8 x i16> %res @@ -501,12 +1415,128 @@ define void @ctpop_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: ctpop_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b -; NONEON-NOSVE-NEXT: cnt v1.16b, v1.16b -; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b -; NONEON-NOSVE-NEXT: uaddlp v1.8h, v1.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #336 +; NONEON-NOSVE-NEXT: str x29, [sp, #320] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 336 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldr x29, [sp, #320] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, 
#256] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #286] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #284] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #112] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #282] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #96] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #280] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #80] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #278] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #64] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #276] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #48] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #274] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #272] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #270] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #268] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #240] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #266] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #224] +; 
NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #264] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #208] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #262] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #192] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #260] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #176] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #258] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #160] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #256] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #144] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #112] +; NONEON-NOSVE-NEXT: strh w8, [sp, #318] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #96] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: strh w8, [sp, #316] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #80] +; NONEON-NOSVE-NEXT: strh w8, [sp, #314] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #64] +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: strh w8, [sp, #312] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #310] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: str q0, [sp, #128] +; NONEON-NOSVE-NEXT: strh w8, [sp, #308] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #306] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #304] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #240] +; NONEON-NOSVE-NEXT: strh w8, [sp, #302] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #224] +; NONEON-NOSVE-NEXT: strh w8, [sp, #300] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #208] +; 
NONEON-NOSVE-NEXT: strh w8, [sp, #298] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #192] +; NONEON-NOSVE-NEXT: strh w8, [sp, #296] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #176] +; NONEON-NOSVE-NEXT: strh w8, [sp, #294] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #160] +; NONEON-NOSVE-NEXT: strh w8, [sp, #292] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #144] +; NONEON-NOSVE-NEXT: strh w8, [sp, #290] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #128] +; NONEON-NOSVE-NEXT: strh w8, [sp, #288] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #288] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #336 ; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %op) @@ -525,9 +1555,24 @@ define <2 x i32> @ctpop_v2i32(<2 x i32> %op) { ; ; NONEON-NOSVE-LABEL: ctpop_v2i32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #48 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: str d0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #36] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] ; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b -; NONEON-NOSVE-NEXT: uaddlp v0.4h, v0.8b -; NONEON-NOSVE-NEXT: uaddlp v0.2s, v0.4h +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %op) ret <2 x i32> %res @@ -544,9 +1589,37 @@ define <4 x i32> @ctpop_v4i32(<4 x i32> %op) { ; ; NONEON-NOSVE-LABEL: ctpop_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b -; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b -; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: str q0, [sp, #64] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #76] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #72] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #48] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #48] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #64] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #80] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %op) ret <4 x i32> %res @@ -564,14 +1637,65 @@ define void @ctpop_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: ctpop_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b -; NONEON-NOSVE-NEXT: cnt v1.16b, v1.16b -; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b -; NONEON-NOSVE-NEXT: uaddlp v1.8h, v1.16b -; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h -; NONEON-NOSVE-NEXT: uaddlp v1.4s, v1.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #192 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 192 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #128] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #156] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #152] +; NONEON-NOSVE-NEXT: cnt v0.8b, 
v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #48] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #148] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #48] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #144] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #140] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #136] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #112] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #132] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #96] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #128] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #80] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #184] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #176] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #96] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #168] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #80] +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #64] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #64] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #160] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #160] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, 
sp, #192 ; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %op) @@ -590,10 +1714,15 @@ define <1 x i64> @ctpop_v1i64(<1 x i64> %op) { ; ; NONEON-NOSVE-LABEL: ctpop_v1i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b -; NONEON-NOSVE-NEXT: uaddlp v0.4h, v0.8b -; NONEON-NOSVE-NEXT: uaddlp v0.2s, v0.4h -; NONEON-NOSVE-NEXT: uaddlp v0.1d, v0.2s +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %op) ret <1 x i64> %res @@ -610,10 +1739,23 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %op) { ; ; NONEON-NOSVE-LABEL: ctpop_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b -; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b -; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h -; NONEON-NOSVE-NEXT: uaddlp v0.2d, v0.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str x8, [sp, #56] +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: str x8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %op) ret <2 x i64> %res @@ -631,16 +1773,37 @@ define void @ctpop_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: ctpop_v4i64: ; 
NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b -; NONEON-NOSVE-NEXT: cnt v1.16b, v1.16b -; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b -; NONEON-NOSVE-NEXT: uaddlp v1.8h, v1.16b -; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h -; NONEON-NOSVE-NEXT: uaddlp v1.4s, v1.8h -; NONEON-NOSVE-NEXT: uaddlp v0.2d, v0.4s -; NONEON-NOSVE-NEXT: uaddlp v1.2d, v1.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #128 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #88] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str x8, [sp, #120] +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str x8, [sp, #112] +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #64] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str x8, [sp, #104] +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: str x8, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %op) @@ -665,17 +1828,30 @@ define <4 x i8> @cttz_v4i8(<4 x i8> %op) { ; ; NONEON-NOSVE-LABEL: cttz_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #256 // =0x100 -; NONEON-NOSVE-NEXT: dup v1.4h, w8 -; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 -; 
NONEON-NOSVE-NEXT: dup v2.4h, w8 -; NONEON-NOSVE-NEXT: mov w8, #16 // =0x10 -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: sub v1.4h, v0.4h, v2.4h -; NONEON-NOSVE-NEXT: bic v0.8b, v1.8b, v0.8b -; NONEON-NOSVE-NEXT: dup v1.4h, w8 -; NONEON-NOSVE-NEXT: clz v0.4h, v0.4h -; NONEON-NOSVE-NEXT: sub v0.4h, v1.4h, v0.4h +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %op) ret <4 x i8> %res @@ -693,10 +1869,50 @@ define <8 x i8> @cttz_v8i8(<8 x i8> %op) { ; ; NONEON-NOSVE-LABEL: cttz_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v1.8b, #1 -; NONEON-NOSVE-NEXT: sub v1.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: bic v0.8b, v1.8b, v0.8b -; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %op) ret <8 x i8> %res @@ -714,10 +1930,90 @@ define <16 x i8> @cttz_v16i8(<16 x i8> %op) { ; ; NONEON-NOSVE-LABEL: cttz_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v1.16b, #1 -; NONEON-NOSVE-NEXT: sub v1.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: bic v0.16b, v1.16b, v0.16b -; 
NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; 
NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %op) ret <16 x i8> %res @@ -737,15 +2033,172 @@ define void @cttz_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: cttz_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.16b, #1 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: sub v3.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: sub v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: bic v1.16b, v3.16b, v1.16b -; NONEON-NOSVE-NEXT: bic v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: cnt v1.16b, v1.16b -; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; 
NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz 
w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: 
orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %op) @@ -766,17 +2219,19 @@ define <2 x i16> @cttz_v2i16(<2 x i16> %op) { ; ; NONEON-NOSVE-LABEL: cttz_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #65536 // =0x10000 -; NONEON-NOSVE-NEXT: dup v1.2s, w8 -; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 -; NONEON-NOSVE-NEXT: dup v2.2s, w8 -; NONEON-NOSVE-NEXT: mov w8, #32 // =0x20 -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: sub v1.2s, v0.2s, v2.2s -; NONEON-NOSVE-NEXT: bic v0.8b, v1.8b, v0.8b -; NONEON-NOSVE-NEXT: dup v1.2s, w8 -; NONEON-NOSVE-NEXT: clz v0.2s, v0.2s -; NONEON-NOSVE-NEXT: sub v0.2s, v1.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %op) ret <2 x i16> %res @@ -794,14 +2249,30 @@ define <4 x i16> @cttz_v4i16(<4 x i16> %op) { ; ; NONEON-NOSVE-LABEL: cttz_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 -; NONEON-NOSVE-NEXT: dup v1.4h, w8 -; NONEON-NOSVE-NEXT: mov w8, #16 // =0x10 -; NONEON-NOSVE-NEXT: sub v1.4h, v0.4h, v1.4h -; NONEON-NOSVE-NEXT: bic v0.8b, v1.8b, v0.8b -; NONEON-NOSVE-NEXT: dup v1.4h, w8 -; NONEON-NOSVE-NEXT: clz v0.4h, v0.4h -; NONEON-NOSVE-NEXT: sub 
v0.4h, v1.4h, v0.4h +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %op) ret <4 x i16> %res @@ -819,14 +2290,50 @@ define <8 x i16> @cttz_v8i16(<8 x i16> %op) { ; ; NONEON-NOSVE-LABEL: cttz_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 -; NONEON-NOSVE-NEXT: dup v1.8h, w8 -; NONEON-NOSVE-NEXT: mov w8, #16 // =0x10 -; NONEON-NOSVE-NEXT: sub v1.8h, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: bic v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: dup v1.8h, w8 -; NONEON-NOSVE-NEXT: clz v0.8h, v0.8h -; NONEON-NOSVE-NEXT: sub v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %op) ret <8 x i16> %res @@ -846,20 +2353,92 @@ define void @cttz_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: cttz_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 -; 
NONEON-NOSVE-NEXT: mov w8, #16 // =0x10 -; NONEON-NOSVE-NEXT: sub v3.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: sub v0.8h, v2.8h, v0.8h -; NONEON-NOSVE-NEXT: bic v1.16b, v3.16b, v1.16b -; NONEON-NOSVE-NEXT: bic v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: dup v2.8h, w8 -; NONEON-NOSVE-NEXT: clz v1.8h, v1.8h -; NONEON-NOSVE-NEXT: clz v0.8h, v0.8h -; NONEON-NOSVE-NEXT: sub v1.8h, v2.8h, v1.8h -; NONEON-NOSVE-NEXT: sub v0.8h, v2.8h, v0.8h -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; 
NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %op) @@ -879,14 +2458,17 @@ define <2 x i32> @cttz_v2i32(<2 x i32> %op) { ; ; 
NONEON-NOSVE-LABEL: cttz_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 -; NONEON-NOSVE-NEXT: dup v1.2s, w8 -; NONEON-NOSVE-NEXT: mov w8, #32 // =0x20 -; NONEON-NOSVE-NEXT: sub v1.2s, v0.2s, v1.2s -; NONEON-NOSVE-NEXT: bic v0.8b, v1.8b, v0.8b -; NONEON-NOSVE-NEXT: dup v1.2s, w8 -; NONEON-NOSVE-NEXT: clz v0.2s, v0.2s -; NONEON-NOSVE-NEXT: sub v0.2s, v1.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %op) ret <2 x i32> %res @@ -904,14 +2486,24 @@ define <4 x i32> @cttz_v4i32(<4 x i32> %op) { ; ; NONEON-NOSVE-LABEL: cttz_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 -; NONEON-NOSVE-NEXT: dup v1.4s, w8 -; NONEON-NOSVE-NEXT: mov w8, #32 // =0x20 -; NONEON-NOSVE-NEXT: sub v1.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: bic v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: dup v1.4s, w8 -; NONEON-NOSVE-NEXT: clz v0.4s, v0.4s -; NONEON-NOSVE-NEXT: sub v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %op) ret <4 x i32> %res @@ -931,20 +2523,40 @@ define void @cttz_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: cttz_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.4s, w8 -; NONEON-NOSVE-NEXT: mov w8, #32 // =0x20 -; NONEON-NOSVE-NEXT: sub v3.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: sub v0.4s, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: bic v1.16b, v3.16b, v1.16b -; NONEON-NOSVE-NEXT: bic v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: dup v2.4s, w8 -; NONEON-NOSVE-NEXT: clz v1.4s, v1.4s -; NONEON-NOSVE-NEXT: clz v0.4s, v0.4s -; NONEON-NOSVE-NEXT: sub v1.4s, v2.4s, v1.4s -; NONEON-NOSVE-NEXT: sub v0.4s, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %op) @@ -964,14 +2576,14 @@ define <1 x i64> @cttz_v1i64(<1 x i64> %op) { ; ; NONEON-NOSVE-LABEL: cttz_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 -; NONEON-NOSVE-NEXT: fmov d1, x8 -; NONEON-NOSVE-NEXT: sub d1, d0, d1 -; NONEON-NOSVE-NEXT: bic v0.8b, v1.8b, v0.8b -; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b -; NONEON-NOSVE-NEXT: uaddlp v0.4h, v0.8b -; NONEON-NOSVE-NEXT: uaddlp v0.2s, v0.4h -; NONEON-NOSVE-NEXT: uaddlp v0.1d, v0.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: rbit x8, x8 +; NONEON-NOSVE-NEXT: clz x8, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: 
ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.cttz.v1i64(<1 x i64> %op) ret <1 x i64> %res @@ -989,14 +2601,17 @@ define <2 x i64> @cttz_v2i64(<2 x i64> %op) { ; ; NONEON-NOSVE-LABEL: cttz_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 -; NONEON-NOSVE-NEXT: dup v1.2d, x8 -; NONEON-NOSVE-NEXT: sub v1.2d, v0.2d, v1.2d -; NONEON-NOSVE-NEXT: bic v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b -; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b -; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h -; NONEON-NOSVE-NEXT: uaddlp v0.2d, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: rbit x8, x8 +; NONEON-NOSVE-NEXT: clz x9, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: rbit x8, x8 +; NONEON-NOSVE-NEXT: clz x8, x8 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %op) ret <2 x i64> %res @@ -1016,22 +2631,26 @@ define void @cttz_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: cttz_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.2d, x8 -; NONEON-NOSVE-NEXT: sub v3.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: sub v0.2d, v2.2d, v0.2d -; NONEON-NOSVE-NEXT: bic v1.16b, v3.16b, v1.16b -; NONEON-NOSVE-NEXT: bic v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: cnt v1.16b, v1.16b -; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b -; NONEON-NOSVE-NEXT: uaddlp v1.8h, v1.16b -; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b -; NONEON-NOSVE-NEXT: uaddlp v1.4s, v1.8h -; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h -; NONEON-NOSVE-NEXT: uaddlp v1.2d, v1.4s -; NONEON-NOSVE-NEXT: uaddlp v0.2d, v0.4s -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; 
NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: rbit x8, x8 +; NONEON-NOSVE-NEXT: clz x9, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: rbit x8, x8 +; NONEON-NOSVE-NEXT: clz x8, x8 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: rbit x8, x8 +; NONEON-NOSVE-NEXT: clz x9, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: rbit x8, x8 +; NONEON-NOSVE-NEXT: clz x8, x8 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %op) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll index 7e93ee99ed7494..41065b36020038 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll @@ -15,8 +15,14 @@ define void @bitcast_v4i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: bitcast_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr w8, [x0] -; NONEON-NOSVE-NEXT: str w8, [x1] +; NONEON-NOSVE-NEXT: ldrb w8, [x0] +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #1] +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #2] +; NONEON-NOSVE-NEXT: ldrb w11, [x0, #3] +; NONEON-NOSVE-NEXT: strb w11, [x1, #3] +; NONEON-NOSVE-NEXT: strb w10, [x1, #2] +; NONEON-NOSVE-NEXT: strb w9, [x1, #1] +; NONEON-NOSVE-NEXT: strb w8, [x1] ; NONEON-NOSVE-NEXT: ret %load = load volatile <4 x i8>, ptr %a %cast = bitcast <4 x i8> %load to <4 x i8> @@ -102,12 +108,22 @@ define void @bitcast_v2i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: bitcast_v2i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: 
.cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrh w8, [x0, #2] +; NONEON-NOSVE-NEXT: str w8, [sp, #4] ; NONEON-NOSVE-NEXT: ldrh w8, [x0] -; NONEON-NOSVE-NEXT: fmov s0, w8 -; NONEON-NOSVE-NEXT: add x8, x0, #2 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x8] -; NONEON-NOSVE-NEXT: uzp1 v0.4h, v0.4h, v0.4h -; NONEON-NOSVE-NEXT: str s0, [x1] +; NONEON-NOSVE-NEXT: str w8, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w9, [sp, #18] +; NONEON-NOSVE-NEXT: strh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: str d0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: str w8, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %load = load volatile <2 x i16>, ptr %a %cast = bitcast <2 x i16> %load to <2 x half> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll index 6b8077053b590f..b908dd61f24014 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll @@ -34,13 +34,39 @@ define <8 x i32> @fixed_bitselect_v8i32(ptr %pre_cond_ptr, ptr %left_ptr, ptr %r ; ; NONEON-NOSVE-LABEL: fixed_bitselect_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] ; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] ; NONEON-NOSVE-NEXT: ldp q5, q4, [x2] -; NONEON-NOSVE-NEXT: neg v1.4s, v1.4s -; NONEON-NOSVE-NEXT: neg v0.4s, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: neg w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: neg w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: neg w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: neg w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: neg w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: neg w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: neg w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: neg w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: bsl v0.16b, v3.16b, v5.16b ; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v4.16b +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %pre_cond = load <8 x i32>, ptr %pre_cond_ptr %left = load <8 x i32>, ptr %left_ptr diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll index d2bfc7d4e80969..a845c3cbdc2b6d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll @@ -44,7 +44,27 @@ define <8 x i8> @concat_v8i8(<4 x i8> %op1, <4 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: concat_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: uzp1 v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrh 
w8, [sp, #20] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = shufflevector <4 x i8> %op1, <4 x i8> %op2, <8 x i32> ret <8 x i8> %res @@ -62,9 +82,9 @@ define <16 x i8> @concat_v16i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: concat_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %res = shufflevector <8 x i8> %op1, <8 x i8> %op2, <16 x i32> @@ -152,7 +172,17 @@ define <4 x i16> @concat_v4i16(<2 x i16> %op1, <2 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: concat_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: uzp1 v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: strh w9, [sp, #30] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w9, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = shufflevector <2 x i16> %op1, <2 x i16> %op2, <4 x i32> ret <4 x i16> %res @@ -171,9 +201,9 @@ define <8 x i16> @concat_v8i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: concat_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %res = shufflevector <4 x i16> %op1, <4 x i16> %op2, <8 x i32> ret <8 x i16> %res @@ -243,7 +273,14 @@ define <2 x i32> @concat_v2i32(<1 x i32> %op1, <1 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: concat_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: zip1 v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = shufflevector <1 x i32> %op1, <1 x i32> %op2, <2 x i32> ret <2 x i32> %res @@ -262,9 +299,9 @@ define <4 x i32> @concat_v4i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: concat_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %res = shufflevector <2 x i32> %op1, <2 x i32> %op2, <4 x i32> ret <4 x i32> %res @@ -332,9 +369,9 @@ define <2 x i64> @concat_v2i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: concat_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %res = shufflevector <1 x i64> %op1, <1 x i64> %op2, <2 x i32> ret <2 x i64> %res @@ -407,7 +444,14 @@ define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2) { ; ; NONEON-NOSVE-LABEL: concat_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: zip1 v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = shufflevector <2 x half> %op1, <2 x half> %op2, <4 x i32> ret <4 x half> %res @@ -425,9 +469,9 @@ define <8 x half> @concat_v8f16(<4 x half> %op1, <4 x half> %op2) { ; ; NONEON-NOSVE-LABEL: concat_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %res = shufflevector <4 x half> %op1, <4 x half> %op2, <8 x i32> ret <8 x half> %res @@ -497,7 +541,14 @@ define <2 x float> @concat_v2f32(<1 x float> %op1, <1 x float> %op2) { ; ; NONEON-NOSVE-LABEL: concat_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: zip1 v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = shufflevector <1 x float> %op1, <1 x float> %op2, <2 x i32> ret <2 x float> %res @@ -516,9 +567,9 @@ define <4 x float> @concat_v4f32(<2 x float> %op1, <2 x float> %op2) { ; ; NONEON-NOSVE-LABEL: concat_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %res = shufflevector <2 x float> %op1, <2 x float> %op2, <4 x i32> ret <4 x float> %res @@ -586,9 +637,9 @@ define <2 x double> @concat_v2f64(<1 x double> %op1, <1 x double> %op2) { ; ; NONEON-NOSVE-LABEL: concat_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %res = shufflevector <1 x double> %op1, <1 x double> %op2, <2 x i32> ret <2 x double> %res @@ -732,7 +783,11 @@ define void @concat_v32i8_4op(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: concat_v32i8_4op: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp] ; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i8>, ptr %a %shuffle = shufflevector <8 x i8> %op1, <8 x i8> undef, <16 x i32> , ptr %a %shuffle = shufflevector <4 x i16> %op1, <4 x i16> undef, <8 x i32> @@ -775,7 +834,11 @@ define void @concat_v8i32_4op(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: concat_v8i32_4op: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp] ; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i32>, ptr %a %shuffle = shufflevector <2 x i32> %op1, <2 x i32> undef, <4 x i32> @@ -794,7 +857,11 @@ define void @concat_v4i64_4op(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: concat_v4i64_4op: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp] ; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %op1 = load <1 x i64>, ptr %a %shuffle = shufflevector <1 x i64> %op1, <1 x i64> undef, <2 x i32> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll index 728b85d39bb37f..2cdd4374a56c5c 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll @@ -15,8 +15,28 @@ define <8 x i16> @load_zext_v8i8i16(ptr %ap) { ; ; NONEON-NOSVE-LABEL: load_zext_v8i8i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: strh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %a = load <8 x i8>, ptr %ap %val = zext <8 x i8> %a to <8 x i16> @@ -33,8 +53,18 @@ define <4 x i32> @load_zext_v4i16i32(ptr %ap) { ; ; NONEON-NOSVE-LABEL: load_zext_v4i16i32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: 
.cfi_def_cfa_offset 32 ; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %a = load <4 x i16>, ptr %ap %val = zext <4 x i16> %a to <4 x i32> @@ -51,8 +81,15 @@ define <2 x i64> @load_zext_v2i32i64(ptr %ap) { ; ; NONEON-NOSVE-LABEL: load_zext_v2i32i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %a = load <2 x i32>, ptr %ap %val = zext <2 x i32> %a to <2 x i64> @@ -77,13 +114,14 @@ define <2 x i256> @load_zext_v2i64i256(ptr %ap) { ; NONEON-NOSVE-LABEL: load_zext_v2i64i256: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp x0, x4, [sp], #16 ; NONEON-NOSVE-NEXT: mov x1, xzr ; NONEON-NOSVE-NEXT: mov x2, xzr ; NONEON-NOSVE-NEXT: mov x3, xzr ; NONEON-NOSVE-NEXT: mov x5, xzr ; NONEON-NOSVE-NEXT: mov x6, xzr -; NONEON-NOSVE-NEXT: mov x4, v0.d[1] -; NONEON-NOSVE-NEXT: fmov x0, d0 ; NONEON-NOSVE-NEXT: mov x7, xzr ; NONEON-NOSVE-NEXT: ret %a = load <2 x i64>, ptr %ap @@ -110,20 +148,75 @@ define <16 x i32> @load_sext_v16i8i32(ptr %ap) { ; ; NONEON-NOSVE-LABEL: load_sext_v16i8i32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #160 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: sshll v1.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 -; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v2.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: sshll v0.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #16] -; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: ldr d3, [sp, #40] -; NONEON-NOSVE-NEXT: ldr d4, [sp, #24] -; NONEON-NOSVE-NEXT: sshll v1.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: sshll v3.4s, v4.4h, #0 -; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: strh 
w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #94] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #152] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #90] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #88] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #144] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #86] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #84] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #136] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #82] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #80] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #78] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #128] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #74] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #72] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #70] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #68] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #66] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #64] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #96] +; NONEON-NOSVE-NEXT: add 
sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %a = load <16 x i8>, ptr %ap %val = sext <16 x i8> %a to <16 x i32> @@ -144,12 +237,24 @@ define <8 x i32> @load_sext_v8i16i32(ptr %ap) { ; NONEON-NOSVE-LABEL: load_sext_v8i16i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %a = load <8 x i16>, ptr %ap %val = sext <8 x i16> %a to <8 x i32> @@ -186,34 +291,31 @@ define <4 x i256> @load_sext_v4i32i256(ptr %ap) { ; NONEON-NOSVE-LABEL: load_sext_v4i32i256: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: add x10, x8, #32 -; NONEON-NOSVE-NEXT: add x11, x8, #96 -; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: mov x9, v0.d[1] -; NONEON-NOSVE-NEXT: st1 { v0.d }[1], [x10] -; NONEON-NOSVE-NEXT: fmov x10, d0 -; NONEON-NOSVE-NEXT: st1 { v1.d }[1], [x11] -; NONEON-NOSVE-NEXT: mov x11, v1.d[1] -; NONEON-NOSVE-NEXT: asr x10, x10, #63 -; NONEON-NOSVE-NEXT: str d0, [x8] -; NONEON-NOSVE-NEXT: asr x9, x9, #63 -; NONEON-NOSVE-NEXT: str d1, [x8, #64] -; NONEON-NOSVE-NEXT: stp x10, x10, [x8, #16] -; NONEON-NOSVE-NEXT: stp x9, x9, [x8, #48] -; NONEON-NOSVE-NEXT: str x9, [x8, #40] -; NONEON-NOSVE-NEXT: fmov x9, d1 -; NONEON-NOSVE-NEXT: str x10, [x8, #8] -; NONEON-NOSVE-NEXT: asr x10, x11, #63 -; NONEON-NOSVE-NEXT: asr x9, x9, #63 +; NONEON-NOSVE-NEXT: str q0, [sp, #-96]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldpsw x9, x10, [sp, #24] +; NONEON-NOSVE-NEXT: stp x9, x10, [sp, #48] +; NONEON-NOSVE-NEXT: ldpsw x9, x10, [sp, #16] +; NONEON-NOSVE-NEXT: stp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: ldp x11, x9, [sp, #64] +; NONEON-NOSVE-NEXT: ldp x12, x13, [sp, #80] +; NONEON-NOSVE-NEXT: asr x10, x9, #63 +; NONEON-NOSVE-NEXT: asr x14, x11, #63 ; NONEON-NOSVE-NEXT: stp x10, x10, [x8, #112] -; NONEON-NOSVE-NEXT: str x10, [x8, #104] -; NONEON-NOSVE-NEXT: stp x9, x9, [x8, #80] -; NONEON-NOSVE-NEXT: str x9, [x8, #72] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: stp x9, x10, [x8, #96] +; NONEON-NOSVE-NEXT: asr x9, x13, #63 +; NONEON-NOSVE-NEXT: asr x10, x12, #63 +; NONEON-NOSVE-NEXT: stp x14, x14, [x8, #80] +; NONEON-NOSVE-NEXT: stp x11, x14, [x8, #64] +; NONEON-NOSVE-NEXT: stp x9, x9, [x8, #48] +; NONEON-NOSVE-NEXT: stp 
x13, x9, [x8, #32] +; NONEON-NOSVE-NEXT: stp x10, x10, [x8, #16] +; NONEON-NOSVE-NEXT: stp x12, x10, [x8] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %a = load <4 x i32>, ptr %ap %val = sext <4 x i32> %a to <4 x i256> @@ -251,18 +353,26 @@ define <2 x i256> @load_sext_v2i64i256(ptr %ap) { ; ; NONEON-NOSVE-LABEL: load_sext_v2i64i256: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #144 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 144 ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: mov x8, v0.d[1] -; NONEON-NOSVE-NEXT: dup v1.2d, v0.d[1] -; NONEON-NOSVE-NEXT: fmov x0, d0 -; NONEON-NOSVE-NEXT: asr x1, x0, #63 -; NONEON-NOSVE-NEXT: asr x5, x8, #63 -; NONEON-NOSVE-NEXT: mov x2, x1 -; NONEON-NOSVE-NEXT: mov x3, x1 -; NONEON-NOSVE-NEXT: mov v1.d[1], x5 -; NONEON-NOSVE-NEXT: mov x6, x5 -; NONEON-NOSVE-NEXT: mov x7, x5 -; NONEON-NOSVE-NEXT: fmov x4, d1 +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldp x8, x10, [sp] +; NONEON-NOSVE-NEXT: asr x9, x8, #63 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: asr x8, x10, #63 +; NONEON-NOSVE-NEXT: stp x9, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #16] +; NONEON-NOSVE-NEXT: stp x10, x8, [sp, #64] +; NONEON-NOSVE-NEXT: stp x8, x8, [sp, #48] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #48] +; NONEON-NOSVE-NEXT: ldp x0, x1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp x2, x3, [sp, #80] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #112] +; NONEON-NOSVE-NEXT: ldp x4, x5, [sp, #128] +; NONEON-NOSVE-NEXT: ldp x6, x7, [sp, #112] +; NONEON-NOSVE-NEXT: add sp, sp, #144 ; NONEON-NOSVE-NEXT: ret %a = load <2 x i64>, ptr %ap %val = sext <2 x i64> %a to <2 x i256> @@ -300,30 +410,88 @@ define <16 x i64> @load_zext_v16i16i64(ptr %ap) { ; ; NONEON-NOSVE-LABEL: load_zext_v16i16i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #336 +; NONEON-NOSVE-NEXT: str x29, [sp, #320] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 
336 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 ; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: ushll v2.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 -; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v3.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: ushll v1.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] -; NONEON-NOSVE-NEXT: ushll v4.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: ushll v5.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ushll v0.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: stp q1, q2, [sp, #32] -; NONEON-NOSVE-NEXT: ushll v2.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ldr d6, [sp, #56] -; NONEON-NOSVE-NEXT: ldr d7, [sp, #40] -; NONEON-NOSVE-NEXT: stp q5, q3, [sp, #64] -; NONEON-NOSVE-NEXT: ldr d16, [sp, #88] -; NONEON-NOSVE-NEXT: ldr d17, [sp, #72] -; NONEON-NOSVE-NEXT: ushll v1.2d, v6.2s, #0 -; NONEON-NOSVE-NEXT: ushll v3.2d, v7.2s, #0 -; NONEON-NOSVE-NEXT: ushll v6.2d, v5.2s, #0 -; NONEON-NOSVE-NEXT: ushll v5.2d, v16.2s, #0 -; NONEON-NOSVE-NEXT: ushll v7.2d, v17.2s, #0 -; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: str wzr, [sp, #316] +; NONEON-NOSVE-NEXT: str wzr, [sp, #308] +; NONEON-NOSVE-NEXT: ldr x29, [sp, #320] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: str wzr, [sp, #300] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: str wzr, [sp, #292] +; NONEON-NOSVE-NEXT: str wzr, [sp, #284] +; NONEON-NOSVE-NEXT: str wzr, [sp, #276] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #40] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: str wzr, [sp, #268] +; NONEON-NOSVE-NEXT: str wzr, [sp, #260] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #96] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #56] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; 
NONEON-NOSVE-NEXT: ldrh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #88] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #44] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #64] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #152] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #72] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #68] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #136] +; NONEON-NOSVE-NEXT: ldp d2, d1, [sp, #120] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #160] +; NONEON-NOSVE-NEXT: str d1, [sp, #328] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #104] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #248] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #240] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #152] +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #176] +; NONEON-NOSVE-NEXT: str d0, [sp, #168] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #232] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #224] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #144] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #224] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #216] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #208] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #136] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #192] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #332] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #200] +; NONEON-NOSVE-NEXT: str w8, [sp, #312] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #328] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #192] +; NONEON-NOSVE-NEXT: str w8, [sp, #304] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #184] +; NONEON-NOSVE-NEXT: str w9, [sp, #296] 
+; NONEON-NOSVE-NEXT: str w8, [sp, #288] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #176] +; NONEON-NOSVE-NEXT: ldp q5, q4, [sp, #288] +; NONEON-NOSVE-NEXT: str w9, [sp, #280] +; NONEON-NOSVE-NEXT: str w8, [sp, #272] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #168] +; NONEON-NOSVE-NEXT: str w9, [sp, #264] +; NONEON-NOSVE-NEXT: str w8, [sp, #256] +; NONEON-NOSVE-NEXT: ldp q7, q6, [sp, #256] +; NONEON-NOSVE-NEXT: add sp, sp, #336 ; NONEON-NOSVE-NEXT: ret %a = load <16 x i16>, ptr %ap %val = zext <16 x i16> %a to <16 x i64> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll index ec6341d6085a0a..b7b34cfa1517ce 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll @@ -31,7 +31,18 @@ define <4 x i1> @extract_subvector_v8i1(<8 x i1> %op) { ; ; NONEON-NOSVE-LABEL: extract_subvector_v8i1: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: zip2 v0.8b, v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %ret = call <4 x i1> @llvm.vector.extract.v4i1.v8i1(<8 x i1> %op, i64 4) ret <4 x i1> %ret @@ -63,7 +74,18 @@ define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) { ; ; NONEON-NOSVE-LABEL: extract_subvector_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: zip2 v0.8b, v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %ret = call <4 x i8> @llvm.vector.extract.v4i8.v8i8(<8 x i8> %op, i64 4) ret <4 x i8> %ret @@ -178,8 +200,12 @@ define <1 x i32> @extract_subvector_v2i32(<2 x i32> %op) { ; ; NONEON-NOSVE-LABEL: extract_subvector_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: dup v0.2s, v0.s[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: str w8, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %ret = call <1 x i32> @llvm.vector.extract.v1i32.v2i32(<2 x i32> %op, i64 1) ret <1 x i32> %ret @@ -275,8 +301,12 @@ define <2 x half> @extract_subvector_v4f16(<4 x half> %op) { ; ; NONEON-NOSVE-LABEL: extract_subvector_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: dup v0.2s, v0.s[1] +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: str w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %ret = call <2 x half> @llvm.vector.extract.v2f16.v4f16(<4 x half> %op, i64 2) ret <2 x half> %ret @@ -331,8 +361,12 @@ define <1 x float> @extract_subvector_v2f32(<2 x float> %op) { ; ; NONEON-NOSVE-LABEL: extract_subvector_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: dup v0.2s, v0.s[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: str w8, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %ret = call <1 x float> @llvm.vector.extract.v1f32.v2f32(<2 x float> %op, i64 1) ret <1 x float> %ret diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll index ac60a614d7ce6c..0a1831a94d8fec 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll @@ -19,8 +19,11 @@ define half @extractelement_v2f16(<2 x half> %op1) { ; ; NONEON-NOSVE-LABEL: extractelement_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h0, v0.h[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %r = extractelement <2 x half> %op1, i64 1 ret half %r @@ -36,8 +39,11 @@ define half @extractelement_v4f16(<4 x half> %op1) { ; ; NONEON-NOSVE-LABEL: extractelement_v4f16: ; NONEON-NOSVE: // 
%bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h0, v0.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %r = extractelement <4 x half> %op1, i64 3 ret half %r @@ -53,7 +59,10 @@ define half @extractelement_v8f16(<8 x half> %op1) { ; ; NONEON-NOSVE-LABEL: extractelement_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %r = extractelement <8 x half> %op1, i64 7 ret half %r @@ -69,7 +78,11 @@ define half @extractelement_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: extractelement_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr h0, [x0, #30] +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %r = extractelement <16 x half> %op1, i64 15 @@ -86,8 +99,11 @@ define float @extractelement_v2f32(<2 x float> %op1) { ; ; NONEON-NOSVE-LABEL: extractelement_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov s0, v0.s[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %r = extractelement <2 x float> %op1, i64 1 ret float %r @@ -103,7 +119,10 @@ define float @extractelement_v4f32(<4 x float> %op1) { ; ; NONEON-NOSVE-LABEL: extractelement_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov s0, v0.s[3] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %r = extractelement <4 x float> %op1, i64 3 ret float %r @@ -119,7 +138,11 @@ define float @extractelement_v8f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: extractelement_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr s0, [x0, #28] +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %r = extractelement <8 x float> %op1, i64 7 @@ -147,7 +170,10 @@ define double @extractelement_v2f64(<2 x double> %op1) { ; ; NONEON-NOSVE-LABEL: extractelement_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov d0, v0.d[1] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %r = extractelement <2 x double> %op1, i64 1 ret double %r @@ -163,7 +189,11 @@ define double @extractelement_v4f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: extractelement_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr d0, [x0, #24] +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %r = extractelement <4 x double> %op1, i64 3 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll index c1d84f6a15ed8c..a8d01ec7ce0b4b 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll @@ -32,12 +32,58 @@ define void @test_copysign_v4f16_v4f16(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #32767 // =0x7fff +; NONEON-NOSVE-NEXT: sub sp, sp, #48 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d0, [x1] ; NONEON-NOSVE-NEXT: ldr d1, [x0] -; NONEON-NOSVE-NEXT: ldr d2, [x1] -; NONEON-NOSVE-NEXT: dup v0.4h, w8 -; NONEON-NOSVE-NEXT: bsl v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #38] +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #34] +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #32] +; NONEON-NOSVE-NEXT: str h0, [sp, 
#8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] ; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %a = load <4 x half>, ptr %ap %b = load <4 x half>, ptr %bp @@ -68,12 +114,102 @@ define void @test_copysign_v8f16_v8f16(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_v8f16_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #32767 // =0x7fff +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldr q0, [x1] ; NONEON-NOSVE-NEXT: ldr q1, [x0] -; NONEON-NOSVE-NEXT: ldr q2, [x1] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 -; NONEON-NOSVE-NEXT: bsl v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; 
NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: str h0, [sp, #4] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: str h0, [sp] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #78] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #76] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #74] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: 
fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #70] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #68] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #66] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %a = load <8 x half>, ptr %ap %b = load <8 x half>, ptr %bp @@ -108,13 +244,191 @@ define void @test_copysign_v16f16_v16f16(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_v16f16_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #32767 // =0x7fff -; NONEON-NOSVE-NEXT: ldp q1, q4, [x1] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: sub sp, sp, #160 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] ; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: bit v1.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: bsl 
v0.16b, v3.16b, v4.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #96] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #126] +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #124] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #122] +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #120] +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #118] +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #116] +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #114] +; NONEON-NOSVE-NEXT: str h0, [sp, #4] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #112] +; NONEON-NOSVE-NEXT: str h0, [sp] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #94] +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #92] +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #90] +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #88] +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #86] +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #84] +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #82] +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #80] +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #110] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #158] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #108] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; 
NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #156] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #106] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #154] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #104] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #152] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #102] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #150] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #100] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #148] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #98] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #146] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #96] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; 
NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #144] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #78] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #142] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #76] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #140] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #74] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #45] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #138] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #72] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #136] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #70] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #37] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #134] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #68] +; 
NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #33] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #132] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #66] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #130] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #64] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #128] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %a = load <16 x half>, ptr %ap %b = load <16 x half>, ptr %bp @@ -147,12 +461,26 @@ define void @test_copysign_v2f32_v2f32(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_v2f32_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d0, #0xffffffffffffffff -; NONEON-NOSVE-NEXT: ldr d1, [x0] -; NONEON-NOSVE-NEXT: ldr d2, [x1] -; NONEON-NOSVE-NEXT: fneg v0.2s, v0.2s -; NONEON-NOSVE-NEXT: bsl v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ldr d1, [x1] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s2, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg 
s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] ; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %a = load <2 x float>, ptr %ap %b = load <2 x float>, ptr %bp @@ -183,12 +511,37 @@ define void @test_copysign_v4f32_v4f32(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_v4f32_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0xffffffffffffffff -; NONEON-NOSVE-NEXT: ldr q1, [x0] -; NONEON-NOSVE-NEXT: ldr q2, [x1] -; NONEON-NOSVE-NEXT: fneg v0.4s, v0.4s -; NONEON-NOSVE-NEXT: bsl v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s2, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s2, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %a = load <4 x float>, ptr %ap %b = load <4 x float>, ptr %bp @@ -223,13 +576,63 @@ define void 
@test_copysign_v8f32_v8f32(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_v8f32_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0xffffffffffffffff -; NONEON-NOSVE-NEXT: ldp q1, q4, [x1] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: fneg v0.4s, v0.4s -; NONEON-NOSVE-NEXT: bit v1.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: bsl v0.16b, v3.16b, v4.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #44] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s2, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #40] +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #88] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #36] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s2, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #32] +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #80] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s2, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; 
NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #72] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s2, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %a = load <8 x float>, ptr %ap %b = load <8 x float>, ptr %bp @@ -262,12 +665,25 @@ define void @test_copysign_v2f64_v2f64(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_v2f64_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0xffffffffffffffff -; NONEON-NOSVE-NEXT: ldr q1, [x0] -; NONEON-NOSVE-NEXT: ldr q2, [x1] -; NONEON-NOSVE-NEXT: fneg v0.2d, v0.2d -; NONEON-NOSVE-NEXT: bsl v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: fabs d0, d0 +; NONEON-NOSVE-NEXT: tst x9, #0x8000000000000000 +; NONEON-NOSVE-NEXT: fneg d1, d0 +; NONEON-NOSVE-NEXT: fcsel d2, d1, d0, ne +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: tst x8, #0x8000000000000000 +; NONEON-NOSVE-NEXT: fabs d0, d0 +; NONEON-NOSVE-NEXT: fneg d1, d0 +; NONEON-NOSVE-NEXT: fcsel d0, d1, d0, ne +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %a = load <2 x double>, ptr %ap %b = load <2 x double>, ptr %bp @@ -302,13 +718,39 @@ define void @test_copysign_v4f64_v4f64(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_v4f64_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0xffffffffffffffff -; NONEON-NOSVE-NEXT: ldp q1, q4, [x1] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: fneg v0.2d, v0.2d -; NONEON-NOSVE-NEXT: bit v1.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: bsl v0.16b, v3.16b, v4.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: fabs d0, d0 +; NONEON-NOSVE-NEXT: tst x9, #0x8000000000000000 +; NONEON-NOSVE-NEXT: fneg d1, d0 +; NONEON-NOSVE-NEXT: fcsel d2, d1, d0, ne +; NONEON-NOSVE-NEXT: ldr d0, [sp, #32] +; NONEON-NOSVE-NEXT: tst x8, #0x8000000000000000 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: fabs d0, d0 +; NONEON-NOSVE-NEXT: fneg d1, d0 +; NONEON-NOSVE-NEXT: fcsel d0, d1, d0, ne +; NONEON-NOSVE-NEXT: tst x9, #0x8000000000000000 +; 
NONEON-NOSVE-NEXT: stp d0, d2, [sp, #80] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: fabs d0, d0 +; NONEON-NOSVE-NEXT: fneg d1, d0 +; NONEON-NOSVE-NEXT: fcsel d2, d1, d0, ne +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: tst x8, #0x8000000000000000 +; NONEON-NOSVE-NEXT: fabs d0, d0 +; NONEON-NOSVE-NEXT: fneg d1, d0 +; NONEON-NOSVE-NEXT: fcsel d0, d1, d0, ne +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %a = load <4 x double>, ptr %ap %b = load <4 x double>, ptr %bp @@ -347,13 +789,27 @@ define void @test_copysign_v2f32_v2f64(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_v2f32_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d0, #0xffffffffffffffff -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: ldr d2, [x0] -; NONEON-NOSVE-NEXT: fcvtn v1.2s, v1.2d -; NONEON-NOSVE-NEXT: fneg v0.2s, v0.2s -; NONEON-NOSVE-NEXT: bsl v0.8b, v2.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #48 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d1, [x0] +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: str d1, [sp, #8] +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: tst x9, #0x8000000000000000 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s2, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: tst x8, #0x8000000000000000 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] ; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %a = load <2 x float>, ptr %ap %b = load <2 x double>, ptr %bp @@ -402,14 +858,39 @@ define 
void @test_copysign_v4f32_v4f64(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_v4f32_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] -; NONEON-NOSVE-NEXT: movi v0.2d, #0xffffffffffffffff -; NONEON-NOSVE-NEXT: fcvtn v1.2s, v1.2d -; NONEON-NOSVE-NEXT: fneg v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v1.4s, v2.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] ; NONEON-NOSVE-NEXT: ldr q2, [x0] -; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v1.16b +; NONEON-NOSVE-NEXT: str q2, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: tst x9, #0x8000000000000000 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s2, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: tst x8, #0x8000000000000000 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst x9, #0x8000000000000000 +; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #56] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s2, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: tst x8, #0x8000000000000000 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %a = load <4 x float>, ptr %ap %b = load <4 x double>, ptr %bp @@ -447,13 +928,27 @@ define void @test_copysign_v2f64_v2f32(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_v2f64_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, 
#0xffffffffffffffff -; NONEON-NOSVE-NEXT: ldr d1, [x1] -; NONEON-NOSVE-NEXT: ldr q2, [x0] -; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s -; NONEON-NOSVE-NEXT: fneg v0.2d, v0.2d -; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v1.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #48 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: ldr d0, [x1] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: str d0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: fabs d0, d0 +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: fneg d1, d0 +; NONEON-NOSVE-NEXT: fcsel d2, d1, d0, ne +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: fabs d0, d0 +; NONEON-NOSVE-NEXT: fneg d1, d0 +; NONEON-NOSVE-NEXT: fcsel d0, d1, d0, ne +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %a = load <2 x double>, ptr %ap %b = load < 2 x float>, ptr %bp @@ -502,19 +997,41 @@ define void @test_copysign_v4f64_v4f32(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_v4f64_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: movi v0.2d, #0xffffffffffffffff -; NONEON-NOSVE-NEXT: str q1, [sp, #-16]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d4, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s -; NONEON-NOSVE-NEXT: fneg v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fcvtl v4.2d, v4.2s -; NONEON-NOSVE-NEXT: bit v1.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: bsl v0.16b, v3.16b, v4.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: stp q0, q2, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d2, [sp, #16] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: stp d2, d0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: fabs d0, d0 +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: fneg d1, d0 +; NONEON-NOSVE-NEXT: fcsel d2, d1, d0, ne +; NONEON-NOSVE-NEXT: ldr d0, [sp, #32] +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: fabs d0, d0 +; NONEON-NOSVE-NEXT: fneg d1, d0 +; NONEON-NOSVE-NEXT: fcsel d0, d1, d0, ne +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: fabs d0, d0 +; NONEON-NOSVE-NEXT: fneg d1, d0 +; NONEON-NOSVE-NEXT: fcsel d2, d1, d0, ne +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: fabs d0, d0 +; NONEON-NOSVE-NEXT: fneg d1, d0 +; NONEON-NOSVE-NEXT: fcsel d0, d1, d0, ne +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %a = load <4 x double>, ptr %ap %b = load <4 x float>, ptr %bp @@ -554,13 +1071,49 @@ define void @test_copysign_v4f16_v4f32(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f32: ; 
NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #48 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d1, [x0] ; NONEON-NOSVE-NEXT: ldr q0, [x1] -; NONEON-NOSVE-NEXT: mov w8, #32767 // =0x7fff -; NONEON-NOSVE-NEXT: ldr d2, [x0] -; NONEON-NOSVE-NEXT: dup v1.4h, w8 -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s -; NONEON-NOSVE-NEXT: bit v0.8b, v2.8b, v1.8b +; NONEON-NOSVE-NEXT: str d1, [sp, #8] +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] ; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %a = load <4 x half>, ptr %ap %b = load <4 x float>, ptr %bp @@ -620,21 +1173,49 @@ define void 
@test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x1] -; NONEON-NOSVE-NEXT: mov w8, #32767 // =0x7fff -; NONEON-NOSVE-NEXT: mov d1, v0.d[1] -; NONEON-NOSVE-NEXT: fcvt h0, d0 -; NONEON-NOSVE-NEXT: fcvt h1, d1 -; NONEON-NOSVE-NEXT: mov v0.h[1], v1.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, d2 -; NONEON-NOSVE-NEXT: mov d2, v2.d[1] -; NONEON-NOSVE-NEXT: mov v0.h[2], v1.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, d2 +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] ; NONEON-NOSVE-NEXT: ldr d2, [x0] -; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] -; NONEON-NOSVE-NEXT: dup v1.4h, w8 -; NONEON-NOSVE-NEXT: bit v0.8b, v2.8b, v1.8b +; NONEON-NOSVE-NEXT: str d2, [sp, #8] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: tst x9, #0x8000000000000000 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst x8, #0x8000000000000000 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst x9, #0x8000000000000000 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst x8, #0x8000000000000000 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: 
fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] ; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %a = load <4 x half>, ptr %ap %b = load <4 x double>, ptr %bp @@ -682,14 +1263,83 @@ define void @test_copysign_v8f16_v8f32(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_v8f16_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] -; NONEON-NOSVE-NEXT: mov w8, #32767 // =0x7fff +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] ; NONEON-NOSVE-NEXT: ldr q2, [x0] -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s -; NONEON-NOSVE-NEXT: dup v1.8h, w8 -; NONEON-NOSVE-NEXT: bit v0.16b, v2.16b, v1.16b +; NONEON-NOSVE-NEXT: str q2, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, 
#0x80000000 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %a = load <8 x half>, ptr %ap %b = load <8 x float>, ptr %bp diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll index 
b51b89d08844d0..e84acfc8504a95 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll @@ -21,10 +21,39 @@ define <2 x half> @fadd_v2f16(<2 x half> %op1, <2 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fadd_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fadd <2 x half> %op1, %op2 ret <2 x half> %res @@ -42,10 +71,39 @@ define <4 x half> @fadd_v4f16(<4 x half> %op1, <4 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fadd_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: 
fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fadd <4 x half> %op1, %op2 ret <4 x half> %res @@ -63,14 +121,66 @@ define <8 x half> @fadd_v8f16(<8 x half> %op1, <8 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fadd_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fadd v2.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: fadd v1.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v2.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, 
[sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; 
NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = fadd <8 x half> %op1, %op2 ret <8 x half> %res @@ -90,25 +200,127 @@ define void @fadd_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fadd_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fcvtl v4.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v6.4s, v3.4h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fcvtl v5.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v7.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: fcvtl2 v3.4s, v3.8h -; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h -; NONEON-NOSVE-NEXT: fadd v4.4s, v5.4s, v4.4s -; NONEON-NOSVE-NEXT: fadd v5.4s, v7.4s, v6.4s -; NONEON-NOSVE-NEXT: fadd v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fadd v2.4s, v2.4s, v3.4s -; NONEON-NOSVE-NEXT: fcvtn v1.4h, v4.4s -; NONEON-NOSVE-NEXT: fcvtn v3.4h, v5.4s -; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v2.4s -; NONEON-NOSVE-NEXT: stp q1, q3, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #94] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; 
NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #92] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #90] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #86] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #84] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #82] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #80] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #78] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, 
s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #76] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #74] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #70] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #68] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #66] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -129,7 +341,17 @@ define <2 x float> @fadd_v2f32(<2 x float> %op1, <2 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fadd_v2f32: ; NONEON-NOSVE: // %bb.0: -; 
NONEON-NOSVE-NEXT: fadd v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fadd s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fadd <2 x float> %op1, %op2 ret <2 x float> %res @@ -147,7 +369,22 @@ define <4 x float> @fadd_v4f32(<4 x float> %op1, <4 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fadd_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fadd s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fadd s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = fadd <4 x float> %op1, %op2 ret <4 x float> %res @@ -167,11 +404,39 @@ define void @fadd_v8f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fadd_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fadd v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fadd v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, 
[sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #60] +; NONEON-NOSVE-NEXT: fadd s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #56] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #32] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #88] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #52] +; NONEON-NOSVE-NEXT: fadd s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #48] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fadd s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #72] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fadd s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -192,7 +457,16 @@ define <2 x double> @fadd_v2f64(<2 x double> %op1, <2 x double> %op2) { ; ; NONEON-NOSVE-LABEL: fadd_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fadd v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fadd d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fadd d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = fadd <2 x double> %op1, %op2 ret <2 x double> %res @@ -212,11 +486,27 @@ define void @fadd_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fadd_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fadd v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: fadd v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] +; NONEON-NOSVE-NEXT: fadd d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #48] +; NONEON-NOSVE-NEXT: fadd d0, d1, d0 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fadd d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fadd d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -241,10 +531,39 @@ define <2 x half> @fdiv_v2f16(<2 x half> %op1, <2 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fdiv_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fdiv v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, 
v0.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fdiv <2 x half> %op1, %op2 ret <2 x half> %res @@ -262,10 +581,39 @@ define <4 x half> @fdiv_v4f16(<4 x half> %op1, <4 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fdiv_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fdiv v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] 
+; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fdiv <4 x half> %op1, %op2 ret <4 x half> %res @@ -283,14 +631,66 @@ define <8 x half> @fdiv_v8f16(<8 x half> %op1, <8 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fdiv_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fdiv v2.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: fdiv v1.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v2.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt 
s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = fdiv <8 x half> %op1, %op2 ret <8 x half> %res @@ -310,26 +710,127 @@ define void @fdiv_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fdiv_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q4, q1, [x1] -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl2 v5.4s, v4.8h -; NONEON-NOSVE-NEXT: fcvtl v4.4s, v4.4h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: fdiv v2.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: ldr q3, [x0] -; NONEON-NOSVE-NEXT: fcvtl2 v6.4s, v3.8h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h -; NONEON-NOSVE-NEXT: fdiv v3.4s, v3.4s, v4.4s -; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s -; NONEON-NOSVE-NEXT: fdiv v5.4s, v6.4s, v5.4s -; NONEON-NOSVE-NEXT: fdiv v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v1.4h, v3.4s -; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v5.4s -; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s -; NONEON-NOSVE-NEXT: stp q1, q2, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #94] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: 
fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #92] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #90] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #86] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #84] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #82] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #80] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #78] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; 
NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #76] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #74] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #70] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #68] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #66] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -350,7 +851,17 @@ define <2 x float> @fdiv_v2f32(<2 x float> %op1, <2 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fdiv_v2f32: ; 
NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fdiv v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fdiv s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fdiv <2 x float> %op1, %op2 ret <2 x float> %res @@ -368,7 +879,22 @@ define <4 x float> @fdiv_v4f32(<4 x float> %op1, <4 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fdiv_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fdiv v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fdiv s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fdiv s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = fdiv <4 x float> %op1, %op2 ret <4 x float> %res @@ -388,11 +914,39 @@ define void @fdiv_v8f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fdiv_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fdiv v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fdiv v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; 
NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #60] +; NONEON-NOSVE-NEXT: fdiv s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #56] +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #32] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #88] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #52] +; NONEON-NOSVE-NEXT: fdiv s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #48] +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fdiv s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #72] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fdiv s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -413,7 +967,16 @@ define <2 x double> @fdiv_v2f64(<2 x double> %op1, <2 x double> %op2) { ; ; NONEON-NOSVE-LABEL: fdiv_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fdiv v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fdiv d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fdiv d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = fdiv <2 x double> %op1, %op2 ret <2 x double> %res @@ -433,11 +996,27 @@ define void @fdiv_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fdiv_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fdiv v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: fdiv v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] +; NONEON-NOSVE-NEXT: fdiv d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #48] +; NONEON-NOSVE-NEXT: fdiv d0, d1, d0 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fdiv d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fdiv d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -463,42 +1042,48 @@ define <2 x half> @fma_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x half> %op3) ; ; NONEON-NOSVE-LABEL: fma_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d2 killed $d2 def $q2 -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 
killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[1] -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: fcvt s16, h0 -; NONEON-NOSVE-NEXT: mov h17, v2.h[2] -; NONEON-NOSVE-NEXT: mov h18, v1.h[2] -; NONEON-NOSVE-NEXT: mov h19, v0.h[2] -; NONEON-NOSVE-NEXT: mov h2, v2.h[3] -; NONEON-NOSVE-NEXT: mov h1, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fmadd s6, s16, s7, s6 -; NONEON-NOSVE-NEXT: mov h16, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt s7, h19 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #8] +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmadd s3, s5, s4, s3 -; NONEON-NOSVE-NEXT: fcvt s4, h17 -; NONEON-NOSVE-NEXT: fcvt s5, h18 -; NONEON-NOSVE-NEXT: fcvt h0, s6 -; NONEON-NOSVE-NEXT: fmadd s4, s7, s5, s4 -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fcvt s5, h16 -; NONEON-NOSVE-NEXT: mov v0.h[1], v3.h[0] -; NONEON-NOSVE-NEXT: fcvt h3, s4 -; NONEON-NOSVE-NEXT: fmadd s1, s5, s1, s2 -; NONEON-NOSVE-NEXT: mov v0.h[2], v3.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; 
NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h2, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %op1, <2 x half> %op2, <2 x half> %op3) ret <2 x half> %res @@ -517,42 +1102,48 @@ define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) ; ; NONEON-NOSVE-LABEL: fma_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d2 killed $d2 def $q2 -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[1] -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: fcvt s16, h0 -; NONEON-NOSVE-NEXT: mov h17, v2.h[2] -; NONEON-NOSVE-NEXT: mov h18, v1.h[2] -; NONEON-NOSVE-NEXT: mov h19, v0.h[2] -; NONEON-NOSVE-NEXT: mov h2, v2.h[3] -; NONEON-NOSVE-NEXT: mov h1, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fmadd s6, s16, s7, s6 -; NONEON-NOSVE-NEXT: mov h16, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt s7, h19 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; 
NONEON-NOSVE-NEXT: stp d1, d2, [sp, #8] +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmadd s3, s5, s4, s3 -; NONEON-NOSVE-NEXT: fcvt s4, h17 -; NONEON-NOSVE-NEXT: fcvt s5, h18 -; NONEON-NOSVE-NEXT: fcvt h0, s6 -; NONEON-NOSVE-NEXT: fmadd s4, s7, s5, s4 -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fcvt s5, h16 -; NONEON-NOSVE-NEXT: mov v0.h[1], v3.h[0] -; NONEON-NOSVE-NEXT: fcvt h3, s4 -; NONEON-NOSVE-NEXT: fmadd s1, s5, s1, s2 -; NONEON-NOSVE-NEXT: mov v0.h[2], v3.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h2, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; 
NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.fma.v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) ret <4 x half> %res @@ -571,75 +1162,84 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) ; ; NONEON-NOSVE-LABEL: fma_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov h3, v2.h[1] -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: fcvt s16, h0 -; NONEON-NOSVE-NEXT: mov h17, v2.h[2] -; NONEON-NOSVE-NEXT: mov h18, v1.h[2] -; NONEON-NOSVE-NEXT: mov h19, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fmadd s6, s16, s7, s6 -; NONEON-NOSVE-NEXT: fcvt s7, h17 -; NONEON-NOSVE-NEXT: fcvt s16, h18 -; NONEON-NOSVE-NEXT: fcvt s17, h19 -; NONEON-NOSVE-NEXT: mov h18, v1.h[3] -; NONEON-NOSVE-NEXT: mov h19, v0.h[3] -; NONEON-NOSVE-NEXT: fmadd s4, s5, s4, s3 -; NONEON-NOSVE-NEXT: mov h5, v2.h[3] -; NONEON-NOSVE-NEXT: fcvt h3, s6 -; NONEON-NOSVE-NEXT: fmadd s6, s17, s16, s7 -; NONEON-NOSVE-NEXT: mov h17, v2.h[4] -; NONEON-NOSVE-NEXT: fcvt s7, h18 -; NONEON-NOSVE-NEXT: fcvt s16, h19 -; NONEON-NOSVE-NEXT: mov h18, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: mov h19, v0.h[4] -; NONEON-NOSVE-NEXT: fcvt h6, s6 -; NONEON-NOSVE-NEXT: fcvt s17, h17 -; NONEON-NOSVE-NEXT: fcvt s18, h18 -; NONEON-NOSVE-NEXT: mov v3.h[1], v4.h[0] -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: fmadd s5, s16, s7, s5 -; NONEON-NOSVE-NEXT: mov h7, v1.h[5] -; NONEON-NOSVE-NEXT: mov h16, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s19, h19 -; NONEON-NOSVE-NEXT: mov v3.h[2], v6.h[0] -; NONEON-NOSVE-NEXT: mov h6, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; 
NONEON-NOSVE-NEXT: fcvt h5, s5 -; NONEON-NOSVE-NEXT: fmadd s17, s19, s18, s17 -; NONEON-NOSVE-NEXT: mov h18, v1.h[6] -; NONEON-NOSVE-NEXT: mov h19, v0.h[6] -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: fmadd s4, s16, s7, s4 -; NONEON-NOSVE-NEXT: mov v3.h[3], v5.h[0] -; NONEON-NOSVE-NEXT: fcvt s5, h6 -; NONEON-NOSVE-NEXT: fcvt s6, h18 -; NONEON-NOSVE-NEXT: fcvt s7, h19 -; NONEON-NOSVE-NEXT: fcvt h16, s17 +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: stp q1, q2, [sp, #16] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #22] +; 
NONEON-NOSVE-NEXT: ldr h2, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h2, [sp] ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #32] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: fmadd s5, s7, s6, s5 -; NONEON-NOSVE-NEXT: mov v3.h[4], v16.h[0] -; NONEON-NOSVE-NEXT: fmadd s0, s0, s1, s2 -; NONEON-NOSVE-NEXT: mov v3.h[5], v4.h[0] -; NONEON-NOSVE-NEXT: fcvt h4, s5 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: mov v3.h[6], v4.h[0] -; NONEON-NOSVE-NEXT: mov v3.h[7], v0.h[0] -; NONEON-NOSVE-NEXT: mov v0.16b, v3.16b +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.fma.v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) ret <8 x half> %res @@ -660,146 +1260,161 @@ define void 
@fma_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fma_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q3, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q4, q1, [x1] -; NONEON-NOSVE-NEXT: ldp q5, q2, [x2] -; NONEON-NOSVE-NEXT: mov h25, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s19, h0 -; NONEON-NOSVE-NEXT: mov h24, v0.h[2] -; NONEON-NOSVE-NEXT: mov h17, v1.h[1] -; NONEON-NOSVE-NEXT: fcvt s18, h1 -; NONEON-NOSVE-NEXT: mov h22, v1.h[2] -; NONEON-NOSVE-NEXT: mov h16, v2.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: mov h20, v2.h[2] -; NONEON-NOSVE-NEXT: mov h26, v5.h[1] -; NONEON-NOSVE-NEXT: mov h27, v4.h[1] -; NONEON-NOSVE-NEXT: mov h28, v3.h[1] -; NONEON-NOSVE-NEXT: fcvt s25, h25 -; NONEON-NOSVE-NEXT: mov h7, v2.h[3] -; NONEON-NOSVE-NEXT: mov h29, v4.h[2] -; NONEON-NOSVE-NEXT: fcvt s23, h17 -; NONEON-NOSVE-NEXT: mov h17, v0.h[3] -; NONEON-NOSVE-NEXT: mov h30, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s21, h16 -; NONEON-NOSVE-NEXT: fmadd s6, s19, s18, s6 -; NONEON-NOSVE-NEXT: fcvt s18, h20 -; NONEON-NOSVE-NEXT: fcvt s19, h22 -; NONEON-NOSVE-NEXT: fcvt s20, h24 -; NONEON-NOSVE-NEXT: mov h16, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s22, h5 -; NONEON-NOSVE-NEXT: fcvt s24, h4 -; NONEON-NOSVE-NEXT: fcvt s26, h26 -; NONEON-NOSVE-NEXT: fcvt s27, h27 -; NONEON-NOSVE-NEXT: fcvt s28, h28 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fmadd s21, s25, s23, s21 -; NONEON-NOSVE-NEXT: fcvt s23, h3 -; NONEON-NOSVE-NEXT: mov h25, v5.h[2] -; NONEON-NOSVE-NEXT: fmadd s18, s20, s19, s18 -; NONEON-NOSVE-NEXT: mov h19, v3.h[2] -; NONEON-NOSVE-NEXT: fcvt h6, s6 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: fcvt s17, h17 -; NONEON-NOSVE-NEXT: mov h31, v0.h[4] -; NONEON-NOSVE-NEXT: fmadd s26, s28, s27, s26 -; NONEON-NOSVE-NEXT: mov h27, v4.h[3] -; NONEON-NOSVE-NEXT: mov h28, v3.h[3] -; NONEON-NOSVE-NEXT: fmadd s22, s23, s24, s22 -; NONEON-NOSVE-NEXT: fcvt h20, s21 -; NONEON-NOSVE-NEXT: mov h21, v2.h[4] -; NONEON-NOSVE-NEXT: fcvt s23, h25 -; NONEON-NOSVE-NEXT: 
fcvt s24, h29 -; NONEON-NOSVE-NEXT: fcvt s19, h19 -; NONEON-NOSVE-NEXT: fmadd s16, s17, s16, s7 -; NONEON-NOSVE-NEXT: mov h25, v5.h[3] -; NONEON-NOSVE-NEXT: fcvt h18, s18 -; NONEON-NOSVE-NEXT: fcvt h26, s26 -; NONEON-NOSVE-NEXT: mov h29, v2.h[5] -; NONEON-NOSVE-NEXT: mov v6.h[1], v20.h[0] -; NONEON-NOSVE-NEXT: fcvt s17, h21 -; NONEON-NOSVE-NEXT: fcvt s20, h30 -; NONEON-NOSVE-NEXT: fmadd s19, s19, s24, s23 -; NONEON-NOSVE-NEXT: fcvt s21, h31 -; NONEON-NOSVE-NEXT: fcvt h7, s22 -; NONEON-NOSVE-NEXT: fcvt s22, h25 -; NONEON-NOSVE-NEXT: fcvt s23, h27 -; NONEON-NOSVE-NEXT: fcvt s24, h28 -; NONEON-NOSVE-NEXT: mov h25, v5.h[4] -; NONEON-NOSVE-NEXT: mov h27, v4.h[4] -; NONEON-NOSVE-NEXT: mov h28, v3.h[4] -; NONEON-NOSVE-NEXT: mov h30, v1.h[5] -; NONEON-NOSVE-NEXT: mov h31, v0.h[5] -; NONEON-NOSVE-NEXT: mov v6.h[2], v18.h[0] -; NONEON-NOSVE-NEXT: fmadd s17, s21, s20, s17 -; NONEON-NOSVE-NEXT: mov v7.h[1], v26.h[0] -; NONEON-NOSVE-NEXT: fcvt h18, s19 -; NONEON-NOSVE-NEXT: fmadd s19, s24, s23, s22 -; NONEON-NOSVE-NEXT: mov h26, v5.h[5] -; NONEON-NOSVE-NEXT: fcvt h16, s16 -; NONEON-NOSVE-NEXT: fcvt s20, h25 -; NONEON-NOSVE-NEXT: fcvt s21, h27 -; NONEON-NOSVE-NEXT: fcvt s22, h28 -; NONEON-NOSVE-NEXT: mov h27, v4.h[5] -; NONEON-NOSVE-NEXT: mov h28, v3.h[5] -; NONEON-NOSVE-NEXT: fcvt s23, h29 -; NONEON-NOSVE-NEXT: fcvt s24, h30 -; NONEON-NOSVE-NEXT: fcvt s25, h31 -; NONEON-NOSVE-NEXT: mov h29, v2.h[6] -; NONEON-NOSVE-NEXT: mov h30, v1.h[6] -; NONEON-NOSVE-NEXT: mov h31, v0.h[6] -; NONEON-NOSVE-NEXT: mov v7.h[2], v18.h[0] -; NONEON-NOSVE-NEXT: fcvt h18, s19 -; NONEON-NOSVE-NEXT: fmadd s19, s22, s21, s20 -; NONEON-NOSVE-NEXT: mov h20, v5.h[6] -; NONEON-NOSVE-NEXT: mov h21, v4.h[6] -; NONEON-NOSVE-NEXT: mov h22, v3.h[6] -; NONEON-NOSVE-NEXT: fcvt s26, h26 -; NONEON-NOSVE-NEXT: fmadd s23, s25, s24, s23 -; NONEON-NOSVE-NEXT: fcvt s27, h27 -; NONEON-NOSVE-NEXT: fcvt s28, h28 -; NONEON-NOSVE-NEXT: mov v6.h[3], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt h16, s17 -; NONEON-NOSVE-NEXT: fcvt s17, 
h29 -; NONEON-NOSVE-NEXT: fcvt s24, h30 -; NONEON-NOSVE-NEXT: fcvt s25, h31 -; NONEON-NOSVE-NEXT: fcvt s20, h20 -; NONEON-NOSVE-NEXT: fcvt s21, h21 -; NONEON-NOSVE-NEXT: fcvt s22, h22 -; NONEON-NOSVE-NEXT: mov v7.h[3], v18.h[0] -; NONEON-NOSVE-NEXT: fmadd s26, s28, s27, s26 -; NONEON-NOSVE-NEXT: fcvt h18, s19 -; NONEON-NOSVE-NEXT: mov h5, v5.h[7] -; NONEON-NOSVE-NEXT: mov h4, v4.h[7] -; NONEON-NOSVE-NEXT: mov h3, v3.h[7] -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: fmadd s17, s25, s24, s17 -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: fmadd s19, s22, s21, s20 -; NONEON-NOSVE-NEXT: mov v6.h[4], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt h16, s23 -; NONEON-NOSVE-NEXT: mov v7.h[4], v18.h[0] -; NONEON-NOSVE-NEXT: fcvt h18, s26 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: sub sp, sp, #128 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x2] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q4, q5, [x0] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q4, q2, [sp] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #94] +; NONEON-NOSVE-NEXT: stp q1, q5, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #78] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #62] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #76] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #126] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #92] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #74] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: 
fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #124] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #90] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #56] ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #122] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #88] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v6.h[5], v16.h[0] -; NONEON-NOSVE-NEXT: mov v7.h[5], v18.h[0] -; NONEON-NOSVE-NEXT: fmadd s3, s3, s4, s5 -; NONEON-NOSVE-NEXT: fcvt h4, s19 -; NONEON-NOSVE-NEXT: fcvt h5, s17 -; NONEON-NOSVE-NEXT: fmadd s0, s0, s1, s2 -; NONEON-NOSVE-NEXT: mov v7.h[6], v4.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, s3 -; NONEON-NOSVE-NEXT: mov v6.h[6], v5.h[0] +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #70] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: mov v7.h[7], v1.h[0] -; NONEON-NOSVE-NEXT: mov v6.h[7], v0.h[0] -; NONEON-NOSVE-NEXT: stp q7, q6, [x0] +; NONEON-NOSVE-NEXT: str h0, [sp, #120] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #86] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #68] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #118] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #84] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #66] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #116] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #82] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: 
fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #114] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #80] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #112] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #110] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #108] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #106] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #104] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #38] +; 
NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #102] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #100] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h2, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #98] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -822,8 +1437,19 @@ define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %o ; ; NONEON-NOSVE-LABEL: fma_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmla v2.2s, v1.2s, v0.2s -; NONEON-NOSVE-NEXT: fmov d0, d2 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #8] +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fmadd s5, s4, s3, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; 
NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s5, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3) ret <2 x float> %res @@ -842,8 +1468,26 @@ define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %o ; ; NONEON-NOSVE-LABEL: fma_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmla v2.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: stp q1, q2, [sp, #16] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #24] +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #44] +; NONEON-NOSVE-NEXT: fmadd s5, s4, s3, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #40] +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp] +; NONEON-NOSVE-NEXT: stp s0, s5, [sp, #56] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #36] +; NONEON-NOSVE-NEXT: fmadd s5, s4, s3, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #32] +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s5, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3) ret <4 x float> %res @@ -864,12 +1508,45 @@ define void @fma_v8f32(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fma_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q4, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q5, [x2] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: fmla v1.4s, v0.4s, v2.4s -; NONEON-NOSVE-NEXT: fmla v5.4s, v4.4s, v3.4s -; NONEON-NOSVE-NEXT: stp q1, q5, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #128 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; 
NONEON-NOSVE-NEXT: ldp q1, q0, [x2] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q4, q5, [x0] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q4, q2, [sp] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #92] +; NONEON-NOSVE-NEXT: stp q1, q5, [sp, #32] +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #72] +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp, #56] +; NONEON-NOSVE-NEXT: fmadd s5, s4, s3, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #88] +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp, #48] +; NONEON-NOSVE-NEXT: stp s0, s5, [sp, #120] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #84] +; NONEON-NOSVE-NEXT: fmadd s5, s4, s3, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #80] +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #24] +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s5, [sp, #112] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #44] +; NONEON-NOSVE-NEXT: fmadd s5, s4, s3, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #40] +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp] +; NONEON-NOSVE-NEXT: stp s0, s5, [sp, #104] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #36] +; NONEON-NOSVE-NEXT: fmadd s5, s4, s3, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #32] +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s5, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -892,8 +1569,19 @@ define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double ; ; NONEON-NOSVE-LABEL: fma_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmla v2.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: 
stp q1, q2, [sp, #16] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldp d1, d3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d2, d4, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: fmadd d5, d4, d3, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #32] +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d5, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.fma.v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3) ret <2 x double> %res @@ -914,12 +1602,31 @@ define void @fma_v4f64(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fma_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q4, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q5, [x2] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: fmla v1.2d, v0.2d, v2.2d -; NONEON-NOSVE-NEXT: fmla v5.2d, v4.2d, v3.2d -; NONEON-NOSVE-NEXT: stp q1, q5, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #128 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x2] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q4, q5, [x0] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q4, q2, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #88] +; NONEON-NOSVE-NEXT: stp q1, q5, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp d2, d4, [sp, #48] +; NONEON-NOSVE-NEXT: fmadd d5, d4, d3, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #80] +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 +; NONEON-NOSVE-NEXT: ldp d1, d3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d2, d4, [sp] +; NONEON-NOSVE-NEXT: stp d0, d5, [sp, #112] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: fmadd d5, d4, d3, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #32] +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d5, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #128 
; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -945,10 +1652,39 @@ define <2 x half> @fmul_v2f16(<2 x half> %op1, <2 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fmul_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fmul v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fmul <2 x half> %op1, %op2 ret <2 x half> %res @@ -966,10 +1702,39 @@ define <4 x half> @fmul_v4f16(<4 x half> %op1, <4 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fmul_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fmul 
v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fmul <4 x half> %op1, %op2 ret <4 x half> %res @@ -987,14 +1752,66 @@ define <8 x half> @fmul_v8f16(<8 x half> %op1, <8 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fmul_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fmul v2.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: fmul v1.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v2.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt 
s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = fmul <8 x half> %op1, %op2 ret <8 x half> %res @@ -1014,25 +1831,127 @@ define void @fmul_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fmul_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fcvtl v4.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v6.4s, v3.4h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fcvtl v5.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v7.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: fcvtl2 v3.4s, v3.8h -; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h -; NONEON-NOSVE-NEXT: fmul v4.4s, v5.4s, v4.4s -; NONEON-NOSVE-NEXT: fmul v5.4s, v7.4s, v6.4s -; NONEON-NOSVE-NEXT: fmul v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fmul v2.4s, v2.4s, v3.4s -; NONEON-NOSVE-NEXT: fcvtn v1.4h, v4.4s -; NONEON-NOSVE-NEXT: fcvtn v3.4h, v5.4s -; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v2.4s -; NONEON-NOSVE-NEXT: stp q1, q3, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #94] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: 
fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #92] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #90] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #86] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #84] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #82] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #80] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #78] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; 
NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #76] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #74] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #70] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #68] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #66] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -1053,7 +1972,17 @@ define <2 x float> @fmul_v2f32(<2 x float> %op1, <2 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fmul_v2f32: ; NONEON-NOSVE: // %bb.0: -; 
NONEON-NOSVE-NEXT: fmul v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fmul s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fmul <2 x float> %op1, %op2 ret <2 x float> %res @@ -1071,7 +2000,22 @@ define <4 x float> @fmul_v4f32(<4 x float> %op1, <4 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fmul_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmul v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fmul s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fmul s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = fmul <4 x float> %op1, %op2 ret <4 x float> %res @@ -1091,11 +2035,39 @@ define void @fmul_v8f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fmul_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fmul v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fmul v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, 
[sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #60] +; NONEON-NOSVE-NEXT: fmul s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #56] +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #32] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #88] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #52] +; NONEON-NOSVE-NEXT: fmul s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #48] +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fmul s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #72] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fmul s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -1116,7 +2088,16 @@ define <2 x double> @fmul_v2f64(<2 x double> %op1, <2 x double> %op2) { ; ; NONEON-NOSVE-LABEL: fmul_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmul v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fmul d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fmul d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = fmul <2 x double> %op1, %op2 ret <2 x double> %res @@ -1136,11 +2117,27 @@ define void @fmul_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fmul_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fmul v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: fmul v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] +; NONEON-NOSVE-NEXT: fmul d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #48] +; NONEON-NOSVE-NEXT: fmul d0, d1, d0 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fmul d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fmul d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -1164,8 +2161,30 @@ define <2 x half> @fneg_v2f16(<2 x half> %op) { ; ; NONEON-NOSVE-LABEL: fneg_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v1.4h, #128, lsl #8 -; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fneg <2 x half> %op ret <2 x half> %res @@ -1182,8 +2201,30 @@ define <4 x half> @fneg_v4f16(<4 x half> %op) { ; ; NONEON-NOSVE-LABEL: fneg_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v1.4h, #128, lsl #8 -; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fneg <4 x half> %op ret <4 x half> %res @@ -1200,8 +2241,50 @@ define <8 x half> @fneg_v8f16(<8 x half> %op) { ; ; NONEON-NOSVE-LABEL: fneg_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v1.8h, #128, lsl #8 -; NONEON-NOSVE-NEXT: eor v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fneg <8 x half> %op ret <8 x half> %res @@ -1219,11 +2302,92 @@ define void @fneg_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fneg_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.8h, #128, lsl #8 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: eor v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: eor 
v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: 
eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = fneg <16 x half> %op @@ -1242,7 +2406,15 @@ define <2 x float> @fneg_v2f32(<2 x float> %op) { ; ; NONEON-NOSVE-LABEL: fneg_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fneg v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: fneg s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fneg <2 x float> %op ret <2 x float> %res @@ -1259,7 +2431,20 @@ define <4 x float> @fneg_v4f32(<4 x float> %op) { ; ; NONEON-NOSVE-LABEL: fneg_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fneg v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: fneg s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: fneg s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fneg <4 x float> %op ret <4 x float> %res @@ -1277,10 +2462,32 @@ define void @fneg_v8f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: fneg_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fneg v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fneg v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fneg s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fneg s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: fneg s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: fneg s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = fneg <8 x float> %op @@ -1299,7 +2506,15 @@ define <2 x double> @fneg_v2f64(<2 x double> %op) { ; ; NONEON-NOSVE-LABEL: fneg_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fneg v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: fneg d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: fneg d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fneg <2 x double> %op ret <2 x double> %res @@ -1317,10 +2532,22 @@ define void @fneg_v4f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: fneg_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fneg v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fneg v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fneg d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fneg d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: fneg d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: fneg d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = fneg <4 x double> %op @@ -1343,26 +2570,30 @@ define <2 x half> @fsqrt_v2f16(<2 x half> %op) { ; ; NONEON-NOSVE-LABEL: fsqrt_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h1, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s2, h0 -; NONEON-NOSVE-NEXT: mov h3, v0.h[2] -; NONEON-NOSVE-NEXT: mov h0, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fsqrt s2, s2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fsqrt s1, s1 -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fsqrt s3, s3 -; NONEON-NOSVE-NEXT: fsqrt s4, s0 -; NONEON-NOSVE-NEXT: fcvt h0, s2 -; NONEON-NOSVE-NEXT: mov v0.h[1], v1.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, s3 -; NONEON-NOSVE-NEXT: mov v0.h[2], v1.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, s4 -; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.sqrt.v2f16(<2 x half> %op) ret <2 x half> %res @@ -1379,26 +2610,30 @@ define <4 x half> @fsqrt_v4f16(<4 x half> %op) { ; ; NONEON-NOSVE-LABEL: fsqrt_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h1, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s2, h0 -; NONEON-NOSVE-NEXT: mov h3, v0.h[2] -; NONEON-NOSVE-NEXT: mov h0, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fsqrt s2, s2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fsqrt s1, s1 -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fsqrt s3, s3 -; NONEON-NOSVE-NEXT: fsqrt s4, s0 -; NONEON-NOSVE-NEXT: fcvt h0, s2 -; NONEON-NOSVE-NEXT: mov v0.h[1], v1.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, s3 -; NONEON-NOSVE-NEXT: mov v0.h[2], v1.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, s4 -; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.sqrt.v4f16(<4 x half> %op) ret <4 x half> %res @@ -1415,44 +2650,50 @@ define <8 x half> @fsqrt_v8f16(<8 x half> %op) { ; ; NONEON-NOSVE-LABEL: fsqrt_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov h1, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s2, h0 -; NONEON-NOSVE-NEXT: mov h3, v0.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[3] -; NONEON-NOSVE-NEXT: mov h5, v0.h[4] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: mov h7, v0.h[6] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fsqrt s2, s2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcvt s16, h0 -; NONEON-NOSVE-NEXT: fcvt h0, s2 -; NONEON-NOSVE-NEXT: fsqrt s1, s1 -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: mov v0.h[1], v1.h[0] -; NONEON-NOSVE-NEXT: fsqrt s3, s3 -; NONEON-NOSVE-NEXT: fcvt h1, s3 -; NONEON-NOSVE-NEXT: mov v0.h[2], v1.h[0] -; NONEON-NOSVE-NEXT: fsqrt s4, s4 -; NONEON-NOSVE-NEXT: fcvt h1, s4 -; 
NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] -; NONEON-NOSVE-NEXT: fsqrt s5, s5 -; NONEON-NOSVE-NEXT: fcvt h1, s5 -; NONEON-NOSVE-NEXT: mov v0.h[4], v1.h[0] -; NONEON-NOSVE-NEXT: fsqrt s6, s6 -; NONEON-NOSVE-NEXT: fcvt h1, s6 -; NONEON-NOSVE-NEXT: mov v0.h[5], v1.h[0] -; NONEON-NOSVE-NEXT: fsqrt s7, s7 -; NONEON-NOSVE-NEXT: fcvt h1, s7 -; NONEON-NOSVE-NEXT: mov v0.h[6], v1.h[0] -; NONEON-NOSVE-NEXT: fsqrt s2, s16 -; NONEON-NOSVE-NEXT: fcvt h1, s2 -; NONEON-NOSVE-NEXT: mov v0.h[7], v1.h[0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; 
NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %op) ret <8 x half> %res @@ -1470,85 +2711,92 @@ define void @fsqrt_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fsqrt_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q16, [x0] -; NONEON-NOSVE-NEXT: mov h0, v1.h[1] -; NONEON-NOSVE-NEXT: mov h17, v16.h[1] -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s18, h16 -; NONEON-NOSVE-NEXT: mov h19, v16.h[2] -; NONEON-NOSVE-NEXT: mov h4, v1.h[3] -; NONEON-NOSVE-NEXT: mov h20, v16.h[3] -; NONEON-NOSVE-NEXT: mov h5, v1.h[4] -; NONEON-NOSVE-NEXT: mov h21, v16.h[4] -; NONEON-NOSVE-NEXT: mov h6, v1.h[5] -; NONEON-NOSVE-NEXT: mov h22, v16.h[5] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s17, h17 -; NONEON-NOSVE-NEXT: fsqrt s2, s2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s19, h19 -; NONEON-NOSVE-NEXT: mov h7, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s20, h20 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s21, h21 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s22, h22 -; NONEON-NOSVE-NEXT: mov h23, v16.h[6] -; NONEON-NOSVE-NEXT: mov h16, v16.h[7] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcvt s23, h23 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; 
NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fsqrt s0, s0 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: mov v2.h[1], v0.h[0] -; NONEON-NOSVE-NEXT: fsqrt s17, s17 -; NONEON-NOSVE-NEXT: fcvt h17, s17 -; NONEON-NOSVE-NEXT: fsqrt s18, s18 -; NONEON-NOSVE-NEXT: fcvt h18, s18 -; NONEON-NOSVE-NEXT: mov v18.h[1], v17.h[0] -; NONEON-NOSVE-NEXT: fsqrt s3, s3 -; NONEON-NOSVE-NEXT: fcvt h0, s3 -; NONEON-NOSVE-NEXT: mov v2.h[2], v0.h[0] -; NONEON-NOSVE-NEXT: fsqrt s19, s19 -; NONEON-NOSVE-NEXT: fcvt h17, s19 -; NONEON-NOSVE-NEXT: mov v18.h[2], v17.h[0] -; NONEON-NOSVE-NEXT: fsqrt s4, s4 -; NONEON-NOSVE-NEXT: fcvt h0, s4 -; NONEON-NOSVE-NEXT: mov v2.h[3], v0.h[0] -; NONEON-NOSVE-NEXT: fsqrt s20, s20 -; NONEON-NOSVE-NEXT: fcvt h3, s20 -; NONEON-NOSVE-NEXT: mov v18.h[3], v3.h[0] -; NONEON-NOSVE-NEXT: fsqrt s5, s5 -; NONEON-NOSVE-NEXT: fcvt h0, s5 -; NONEON-NOSVE-NEXT: mov v2.h[4], v0.h[0] -; NONEON-NOSVE-NEXT: fsqrt s21, s21 -; NONEON-NOSVE-NEXT: fcvt h3, s21 -; NONEON-NOSVE-NEXT: mov v18.h[4], v3.h[0] -; NONEON-NOSVE-NEXT: fsqrt s6, s6 -; NONEON-NOSVE-NEXT: fcvt h0, s6 -; NONEON-NOSVE-NEXT: mov v2.h[5], v0.h[0] -; NONEON-NOSVE-NEXT: fsqrt s22, s22 -; NONEON-NOSVE-NEXT: fcvt h3, s22 -; NONEON-NOSVE-NEXT: mov v18.h[5], v3.h[0] -; NONEON-NOSVE-NEXT: fsqrt s7, s7 -; NONEON-NOSVE-NEXT: fcvt h0, s7 -; NONEON-NOSVE-NEXT: mov 
v2.h[6], v0.h[0] -; NONEON-NOSVE-NEXT: fsqrt s23, s23 -; NONEON-NOSVE-NEXT: fcvt h3, s23 -; NONEON-NOSVE-NEXT: mov v18.h[6], v3.h[0] -; NONEON-NOSVE-NEXT: fsqrt s16, s16 -; NONEON-NOSVE-NEXT: fcvt h3, s16 -; NONEON-NOSVE-NEXT: mov v18.h[7], v3.h[0] -; NONEON-NOSVE-NEXT: fsqrt s1, s1 -; NONEON-NOSVE-NEXT: fcvt h0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[7], v0.h[0] -; NONEON-NOSVE-NEXT: stp q18, q2, [x0] +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %op) @@ -1567,7 +2815,15 @@ define <2 x float> @fsqrt_v2f32(<2 x float> %op) { ; ; NONEON-NOSVE-LABEL: fsqrt_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fsqrt v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: fsqrt s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %op) ret <2 x float> %res @@ -1584,7 +2840,20 @@ define <4 x float> @fsqrt_v4f32(<4 x float> %op) { ; ; NONEON-NOSVE-LABEL: fsqrt_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fsqrt v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: fsqrt s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: fsqrt s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %op) ret <4 x float> %res @@ -1602,10 +2871,32 @@ define void @fsqrt_v8f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: fsqrt_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fsqrt v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fsqrt v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fsqrt s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fsqrt s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: fsqrt s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: fsqrt s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %op) @@ -1624,7 
+2915,15 @@ define <2 x double> @fsqrt_v2f64(<2 x double> %op) { ; ; NONEON-NOSVE-LABEL: fsqrt_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fsqrt v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: fsqrt d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: fsqrt d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %op) ret <2 x double> %res @@ -1642,10 +2941,22 @@ define void @fsqrt_v4f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: fsqrt_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fsqrt v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fsqrt v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fsqrt d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fsqrt d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: fsqrt d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: fsqrt d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %op) @@ -1669,10 +2980,39 @@ define <2 x half> @fsub_v2f16(<2 x half> %op1, <2 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fsub_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fsub v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; 
NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fsub <2 x half> %op1, %op2 ret <2 x half> %res @@ -1690,10 +3030,39 @@ define <4 x half> @fsub_v4f16(<4 x half> %op1, <4 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fsub_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fsub v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; 
NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fsub <4 x half> %op1, %op2 ret <4 x half> %res @@ -1711,14 +3080,66 @@ define <8 x half> @fsub_v8f16(<8 x half> %op1, <8 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fsub_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fsub v2.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: fsub v1.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v2.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt 
s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = fsub <8 x half> %op1, %op2 ret <8 x half> %res @@ -1738,25 +3159,127 @@ define void @fsub_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fsub_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fcvtl v4.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v6.4s, v3.4h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fcvtl v5.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v7.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: fcvtl2 v3.4s, v3.8h -; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h -; NONEON-NOSVE-NEXT: fsub v4.4s, v5.4s, v4.4s -; NONEON-NOSVE-NEXT: fsub v5.4s, v7.4s, v6.4s -; NONEON-NOSVE-NEXT: fsub v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fsub v2.4s, v2.4s, v3.4s -; NONEON-NOSVE-NEXT: fcvtn v1.4h, v4.4s -; NONEON-NOSVE-NEXT: fcvtn v3.4h, v5.4s -; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v2.4s -; NONEON-NOSVE-NEXT: stp q1, q3, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #94] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: 
fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #92] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #90] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #86] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #84] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #82] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #80] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #78] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; 
NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #76] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #74] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #70] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #68] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #66] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -1777,7 +3300,17 @@ define <2 x float> @fsub_v2f32(<2 x float> %op1, <2 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fsub_v2f32: ; NONEON-NOSVE: // %bb.0: -; 
NONEON-NOSVE-NEXT: fsub v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fsub s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fsub <2 x float> %op1, %op2 ret <2 x float> %res @@ -1795,7 +3328,22 @@ define <4 x float> @fsub_v4f32(<4 x float> %op1, <4 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fsub_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fsub v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fsub s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fsub s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = fsub <4 x float> %op1, %op2 ret <4 x float> %res @@ -1815,11 +3363,39 @@ define void @fsub_v8f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fsub_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fsub v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fsub v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, 
[sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #60] +; NONEON-NOSVE-NEXT: fsub s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #56] +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #32] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #88] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #52] +; NONEON-NOSVE-NEXT: fsub s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #48] +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fsub s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #72] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fsub s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -1840,7 +3416,16 @@ define <2 x double> @fsub_v2f64(<2 x double> %op1, <2 x double> %op2) { ; ; NONEON-NOSVE-LABEL: fsub_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fsub v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fsub d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fsub d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = fsub <2 x double> %op1, %op2 ret <2 x double> %res @@ -1860,11 +3445,27 @@ define void @fsub_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fsub_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fsub v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: fsub v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] +; NONEON-NOSVE-NEXT: fsub d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #48] +; NONEON-NOSVE-NEXT: fsub d0, d1, d0 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fsub d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fsub d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -1888,7 +3489,30 @@ define <2 x half> @fabs_v2f16(<2 x half> %op) { ; ; NONEON-NOSVE-LABEL: fabs_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: bic v0.4h, #128, lsl #8 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.fabs.v2f16(<2 x half> %op) ret <2 x half> %res @@ -1905,7 +3529,30 @@ define <4 x half> @fabs_v4f16(<4 x half> %op) { ; ; NONEON-NOSVE-LABEL: fabs_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: bic v0.4h, #128, lsl #8 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.fabs.v4f16(<4 x half> %op) ret <4 x half> %res @@ -1922,7 +3569,50 @@ define <8 x half> @fabs_v8f16(<8 x half> %op) { ; ; NONEON-NOSVE-LABEL: fabs_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: bic v0.8h, #128, lsl #8 +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.fabs.v8f16(<8 x half> %op) ret <8 x half> %res @@ -1940,10 +3630,92 @@ define void @fabs_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: fabs_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: bic v0.8h, #128, lsl #8 -; NONEON-NOSVE-NEXT: bic v1.8h, #128, lsl #8 +; 
NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; 
NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.fabs.v16f16(<16 x half> %op) @@ -1962,7 +3734,15 @@ define <2 x float> @fabs_v2f32(<2 x float> %op) { ; ; NONEON-NOSVE-LABEL: fabs_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fabs v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: fabs s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.fabs.v2f32(<2 x float> %op) ret <2 x float> %res @@ -1979,7 +3759,20 @@ define <4 x float> @fabs_v4f32(<4 x float> %op) { ; ; NONEON-NOSVE-LABEL: fabs_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fabs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: fabs s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: fabs s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.fabs.v4f32(<4 x float> %op) ret <4 x float> %res @@ -1997,10 +3790,32 @@ define void @fabs_v8f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: fabs_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fabs v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fabs v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fabs s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fabs s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: fabs s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: fabs s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.fabs.v8f32(<8 x float> %op) @@ -2019,7 +3834,15 @@ define <2 x double> @fabs_v2f64(<2 x double> %op) { ; ; NONEON-NOSVE-LABEL: fabs_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fabs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: fabs d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: fabs d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.fabs.v2f64(<2 x double> %op) ret <2 x double> %res @@ -2037,10 +3860,22 @@ define void @fabs_v4f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: fabs_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fabs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fabs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fabs d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fabs d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: fabs d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: fabs d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.fabs.v4f64(<4 x double> %op) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll index c5ed70c8a5f2f8..776b6918923ae9 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll @@ -23,10 +23,24 @@ define <2 x i16> @fcmp_oeq_v2f16(<2 x half> %op1, <2 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fcmp_oeq_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcmeq v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr 
d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %cmp = fcmp oeq <2 x half> %op1, %op2 %sext = sext <2 x i1> %cmp to <2 x i16> @@ -46,10 +60,39 @@ define <4 x i16> @fcmp_oeq_v4f16(<4 x half> %op1, <4 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fcmp_oeq_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcmeq v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %cmp = fcmp oeq <4 x half> %op1, %op2 %sext = sext <4 x i1> %cmp to <4 x i16> @@ -69,61 +112,66 @@ define <8 x i16> @fcmp_oeq_v8f16(<8 x half> %op1, <8 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fcmp_oeq_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov 
h2, v1.h[1] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h1 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: mov h6, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcmp s3, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[2] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: fcvt s5, h6 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: mov h4, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov h6, v0.h[4] -; NONEON-NOSVE-NEXT: csetm w9, eq -; NONEON-NOSVE-NEXT: fcmp s2, s5 -; NONEON-NOSVE-NEXT: fmov s2, w9 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h5, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v2.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[5] -; NONEON-NOSVE-NEXT: mov h4, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[6] -; NONEON-NOSVE-NEXT: mov h6, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: fcvt s3, h5 -; NONEON-NOSVE-NEXT: fcvt s4, h6 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; 
NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %cmp = fcmp oeq <8 x half> %op1, %op2 %sext = sext <8 x i1> %cmp to <8 x i16> @@ -145,119 +193,127 @@ define void @fcmp_oeq_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_oeq_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, eq -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, eq -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, eq -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, eq -; NONEON-NOSVE-NEXT: 
fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, eq -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, eq -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, eq -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, eq -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, eq -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; 
NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] 
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; 
NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -280,7 +336,18 @@ define <2 x i32> @fcmp_oeq_v2f32(<2 x float> %op1, <2 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fcmp_oeq_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcmeq v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #8] +; NONEON-NOSVE-NEXT: fcmp s3, s2 +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %cmp = fcmp oeq <2 x float> %op1, %op2 %sext = sext <2 x i1> %cmp to <2 x i32> @@ -300,7 +367,24 @@ define <4 x i32> @fcmp_oeq_v4f32(<4 x float> %op1, <4 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fcmp_oeq_v4f32: 
; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #24] +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #8] +; NONEON-NOSVE-NEXT: fcmp s3, s2 +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: fcmp s3, s2 +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %cmp = fcmp oeq <4 x float> %op1, %op2 %sext = sext <4 x i1> %cmp to <4 x i32> @@ -322,11 +406,43 @@ define void @fcmp_oeq_v8f32(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_oeq_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fcmeq v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fcmeq v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #56] +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #40] +; NONEON-NOSVE-NEXT: fcmp s3, s2 +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #48] +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #32] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s3, s2 +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #24] +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #8] +; 
NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s3, s2 +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s3, s2 +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -347,7 +463,13 @@ define <1 x i64> @fcmp_oeq_v1f64(<1 x double> %op1, <1 x double> %op2) { ; ; NONEON-NOSVE-LABEL: fcmp_oeq_v1f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcmeq d0, d0, d1 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fcmp d0, d1 +; NONEON-NOSVE-NEXT: csetm x8, eq +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %cmp = fcmp oeq <1 x double> %op1, %op2 %sext = sext <1 x i1> %cmp to <1 x i64> @@ -367,7 +489,17 @@ define <2 x i64> @fcmp_oeq_v2f64(<2 x double> %op1, <2 x double> %op2) { ; ; NONEON-NOSVE-LABEL: fcmp_oeq_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcmeq v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp d0, d2, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d1, d3, [sp] +; NONEON-NOSVE-NEXT: fcmp d3, d2 +; NONEON-NOSVE-NEXT: csetm x9, eq +; NONEON-NOSVE-NEXT: fcmp d1, d0 +; NONEON-NOSVE-NEXT: csetm x8, eq +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %cmp = fcmp oeq <2 x double> %op1, %op2 %sext = sext <2 x i1> %cmp to <2 x i64> @@ -389,11 +521,29 @@ define void @fcmp_oeq_v4f64(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_oeq_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fcmeq v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: fcmeq v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d0, d2, [sp, #48] +; NONEON-NOSVE-NEXT: ldp d1, d3, [sp, #32] +; NONEON-NOSVE-NEXT: fcmp d3, d2 +; NONEON-NOSVE-NEXT: csetm x9, eq +; NONEON-NOSVE-NEXT: fcmp d1, d0 +; NONEON-NOSVE-NEXT: ldp d0, d2, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d1, d3, [sp] +; NONEON-NOSVE-NEXT: csetm x8, eq +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp d3, d2 +; NONEON-NOSVE-NEXT: csetm x9, eq +; NONEON-NOSVE-NEXT: fcmp d1, d0 +; NONEON-NOSVE-NEXT: csetm x8, eq +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -426,135 +576,143 @@ define void @fcmp_ueq_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_ueq_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; 
NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h2 -; NONEON-NOSVE-NEXT: mov h5, v2.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h1 -; NONEON-NOSVE-NEXT: mov h7, v1.h[2] -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, eq ; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc -; NONEON-NOSVE-NEXT: fcmp s6, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[4] -; NONEON-NOSVE-NEXT: mov h6, v1.h[4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w9, eq -; NONEON-NOSVE-NEXT: csinv w12, w9, wzr, vc -; NONEON-NOSVE-NEXT: fcmp s7, s5 -; NONEON-NOSVE-NEXT: mov h5, v2.h[5] -; NONEON-NOSVE-NEXT: mov h7, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w9, eq -; NONEON-NOSVE-NEXT: csinv w10, w9, wzr, vc -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s7, 
h7 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x1] -; NONEON-NOSVE-NEXT: csetm w9, eq -; NONEON-NOSVE-NEXT: csinv w11, w9, wzr, vc -; NONEON-NOSVE-NEXT: fcmp s6, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[7] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s6, h16 -; NONEON-NOSVE-NEXT: ldr q1, [x0] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w9, eq -; NONEON-NOSVE-NEXT: csinv w9, w9, wzr, vc -; NONEON-NOSVE-NEXT: fcmp s7, s5 -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: mov h7, v1.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w13, eq -; NONEON-NOSVE-NEXT: csinv w13, w13, wzr, vc -; NONEON-NOSVE-NEXT: fcmp s6, s3 -; NONEON-NOSVE-NEXT: fcvt s3, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h7 -; NONEON-NOSVE-NEXT: mov h6, v0.h[2] -; NONEON-NOSVE-NEXT: mov h7, v1.h[2] -; NONEON-NOSVE-NEXT: csetm w14, eq -; NONEON-NOSVE-NEXT: csinv w14, w14, wzr, vc -; NONEON-NOSVE-NEXT: fcmp s4, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h0 -; NONEON-NOSVE-NEXT: fcvt s4, h1 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w15, eq -; NONEON-NOSVE-NEXT: csinv w15, w15, wzr, vc -; NONEON-NOSVE-NEXT: fcmp s5, s3 -; NONEON-NOSVE-NEXT: mov h3, v0.h[3] -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: csetm w16, eq -; NONEON-NOSVE-NEXT: csinv w16, w16, wzr, vc -; NONEON-NOSVE-NEXT: fcmp s4, s2 -; NONEON-NOSVE-NEXT: fcvt s4, h3 -; NONEON-NOSVE-NEXT: fmov s2, w12 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w17, eq -; NONEON-NOSVE-NEXT: csinv w17, w17, wzr, vc -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v0.h[4] -; NONEON-NOSVE-NEXT: fmov s3, w17 -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: mov v3.h[1], w16 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, 
#92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 
+; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq ; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v0.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: mov v2.h[2], w10 -; NONEON-NOSVE-NEXT: mov v3.h[2], w8 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq ; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v0.h[6] -; NONEON-NOSVE-NEXT: mov h7, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w11 -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov v3.h[3], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, eq ; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: fcvt s4, h6 -; NONEON-NOSVE-NEXT: fcvt s5, 
h7 -; NONEON-NOSVE-NEXT: mov v2.h[4], w9 +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v3.h[4], w8 ; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, eq ; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov v2.h[5], w13 -; NONEON-NOSVE-NEXT: mov v3.h[5], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, eq ; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] ; NONEON-NOSVE-NEXT: fcmp s1, s0 -; NONEON-NOSVE-NEXT: mov v2.h[6], w14 -; NONEON-NOSVE-NEXT: mov v3.h[6], w8 ; NONEON-NOSVE-NEXT: csetm w8, eq ; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc -; NONEON-NOSVE-NEXT: mov v2.h[7], w15 -; NONEON-NOSVE-NEXT: mov v3.h[7], w8 -; NONEON-NOSVE-NEXT: stp q3, q2, [x2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -587,150 +745,158 @@ define void @fcmp_one_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_one_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h2 -; NONEON-NOSVE-NEXT: mov h5, v2.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h1 -; NONEON-NOSVE-NEXT: mov h7, v1.h[2] -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s7, h7 
-; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, mi ; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le -; NONEON-NOSVE-NEXT: fcmp s6, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[4] -; NONEON-NOSVE-NEXT: mov h6, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w9, mi -; NONEON-NOSVE-NEXT: csinv w12, w9, wzr, le -; NONEON-NOSVE-NEXT: fcmp s7, s5 -; NONEON-NOSVE-NEXT: mov h5, v2.h[5] -; NONEON-NOSVE-NEXT: mov h7, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w9, mi -; NONEON-NOSVE-NEXT: csinv w10, w9, wzr, le -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x1] -; NONEON-NOSVE-NEXT: csetm w9, mi -; NONEON-NOSVE-NEXT: csinv w11, w9, wzr, le -; NONEON-NOSVE-NEXT: fcmp s6, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[7] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s6, h16 -; NONEON-NOSVE-NEXT: ldr q1, [x0] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w9, mi -; NONEON-NOSVE-NEXT: csinv w9, w9, wzr, le -; NONEON-NOSVE-NEXT: fcmp s7, s5 -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: mov h7, v1.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, 
h4 -; NONEON-NOSVE-NEXT: csetm w13, mi -; NONEON-NOSVE-NEXT: csinv w13, w13, wzr, le -; NONEON-NOSVE-NEXT: fcmp s6, s3 -; NONEON-NOSVE-NEXT: fcvt s3, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h7 -; NONEON-NOSVE-NEXT: mov h6, v0.h[2] -; NONEON-NOSVE-NEXT: mov h7, v1.h[2] -; NONEON-NOSVE-NEXT: csetm w14, mi -; NONEON-NOSVE-NEXT: csinv w14, w14, wzr, le -; NONEON-NOSVE-NEXT: fcmp s4, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h0 -; NONEON-NOSVE-NEXT: fcvt s4, h1 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w15, mi -; NONEON-NOSVE-NEXT: csinv w15, w15, wzr, le -; NONEON-NOSVE-NEXT: fcmp s5, s3 -; NONEON-NOSVE-NEXT: mov h3, v0.h[3] -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: csetm w16, mi -; NONEON-NOSVE-NEXT: csinv w16, w16, wzr, le -; NONEON-NOSVE-NEXT: fcmp s4, s2 -; NONEON-NOSVE-NEXT: fcvt s4, h3 -; NONEON-NOSVE-NEXT: fmov s2, w12 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w17, mi -; NONEON-NOSVE-NEXT: csinv w17, w17, wzr, le -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v0.h[4] -; NONEON-NOSVE-NEXT: fmov s3, w17 -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, mi -; NONEON-NOSVE-NEXT: mov v3.h[1], w16 ; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v0.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: mov v2.h[2], w10 -; NONEON-NOSVE-NEXT: mov v3.h[2], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: 
fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, mi -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 ; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v0.h[6] -; NONEON-NOSVE-NEXT: mov h7, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w11 -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov v3.h[3], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, mi ; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: fcvt s4, h6 -; NONEON-NOSVE-NEXT: fcvt s5, h7 -; NONEON-NOSVE-NEXT: mov v2.h[4], w9 +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v3.h[4], w8 ; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, mi ; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov v2.h[5], w13 -; NONEON-NOSVE-NEXT: mov v3.h[5], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, mi ; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] ; NONEON-NOSVE-NEXT: fcmp s1, s0 -; NONEON-NOSVE-NEXT: mov v2.h[6], w14 -; NONEON-NOSVE-NEXT: mov v3.h[6], w8 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, mi ; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le -; 
NONEON-NOSVE-NEXT: mov v2.h[7], w15 -; NONEON-NOSVE-NEXT: mov v3.h[7], w8 -; NONEON-NOSVE-NEXT: stp q3, q2, [x2] -; NONEON-NOSVE-NEXT: ret - %op1 = load <16 x half>, ptr %a - %op2 = load <16 x half>, ptr %b - %cmp = fcmp one <16 x half> %op1, %op2 - %sext = sext <16 x i1> %cmp to <16 x i16> - store <16 x i16> %sext, ptr %c - ret void -} - -; -; FCMP UNE -; - -define void @fcmp_une_v16f16(ptr %a, ptr %b, ptr %c) { -; CHECK-LABEL: fcmp_une_v16f16: +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] 
+; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: ret + %op1 = load <16 x half>, ptr %a + %op2 = load <16 x half>, ptr %b + %cmp = fcmp one <16 x half> %op1, %op2 + %sext = sext <16 x i1> %cmp to <16 x i16> + store <16 x i16> %sext, ptr %c + ret void +} + +; +; FCMP UNE +; + +define void @fcmp_une_v16f16(ptr %a, ptr %b, ptr %c) { +; CHECK-LABEL: fcmp_une_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q3, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 @@ -744,119 +910,127 @@ define void @fcmp_une_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_une_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, 
h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, ne -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, ne -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, ne -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, ne -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; 
NONEON-NOSVE-NEXT: csetm w13, ne -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, ne -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, ne -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, ne -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, ne -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] 
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: 
fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 ; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = 
load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -885,119 +1059,127 @@ define void @fcmp_ogt_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_ogt_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, gt -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; 
NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, gt -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, gt -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, gt -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, gt -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, gt -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, gt -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, gt -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, gt -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, gt -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; 
NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, gt -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; 
NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, gt -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, gt -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, gt -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, 
h1 ; NONEON-NOSVE-NEXT: csetm w8, gt -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 ; NONEON-NOSVE-NEXT: csetm w8, gt -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -1029,119 +1211,127 @@ define void @fcmp_ugt_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_ugt_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, 
#58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, hi -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, hi -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, hi -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, hi -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, hi -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; 
NONEON-NOSVE-NEXT: csetm w13, hi -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, hi -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, hi -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, hi -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, hi -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] 
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, hi -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, hi -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, hi -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, 
[sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, hi -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, hi -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 ; NONEON-NOSVE-NEXT: csetm w8, hi -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -1170,123 +1360,131 @@ define void @fcmp_olt_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_olt_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] 
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, mi -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, mi -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, mi -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, mi -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, mi -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, mi -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, mi -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, mi -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; 
NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, mi -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, mi -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, mi -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, mi -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, mi -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov 
v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, mi -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, mi -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, mi -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] -; NONEON-NOSVE-NEXT: ret - %op1 = load <16 x half>, ptr %a - %op2 = load <16 x half>, ptr %b - %cmp = fcmp olt <16 x half> %op1, %op2 +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; 
NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: ret + %op1 = load <16 x half>, ptr %a + %op2 = load <16 x half>, ptr %b + %cmp = fcmp olt <16 x half> %op1, %op2 %sext = sext <16 x i1> %cmp to <16 x i16> store <16 x i16> %sext, ptr %c ret void @@ -1314,119 +1512,127 @@ define void @fcmp_ult_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_ult_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; 
NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, lt -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, lt -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, lt -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, lt -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; 
NONEON-NOSVE-NEXT: csetm w10, lt -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, lt -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, lt -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, lt -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, lt -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, lt -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, lt -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; 
NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, lt -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, lt -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, lt -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, lt -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, 
lt -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: csetm w8, lt +; 
NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -1455,119 +1661,127 @@ define void @fcmp_oge_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_oge_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, 
#40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ge -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, ge -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, ge -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, 
[x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, ge -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, ge -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, ge -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, ge -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, ge -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, ge -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, ge -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ge -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: 
fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ge -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ge -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ge -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ge -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; 
NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ge -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -1599,119 +1813,127 @@ define void @fcmp_uge_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_uge_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 
+; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, pl -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, pl -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, pl -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, pl -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, pl -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, pl -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, pl -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, pl -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, pl -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, pl -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; 
NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, pl -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, pl -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, pl -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: 
csetm w8, pl +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, pl -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, pl -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, pl +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, pl -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, pl +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, pl +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt 
s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, pl +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, pl +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, pl +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, pl +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: csetm w8, pl +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -1740,263 +1962,279 @@ define void @fcmp_ole_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_ole_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: 
.cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; 
NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ls -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, ls -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, ls -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, ls -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; 
NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, ls -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, ls -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, ls -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, ls -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, ls -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, ls -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ls -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; 
NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ls -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ls -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: ret + %op1 = load <16 x half>, ptr %a + %op2 = load <16 x half>, ptr %b + %cmp = fcmp ole <16 x half> %op1, %op2 + %sext = sext <16 x i1> %cmp to <16 x i16> + store <16 x i16> %sext, ptr %c + ret void +} + +; +; FCMP ULE +; + +define void @fcmp_ule_v16f16(ptr %a, ptr %b, ptr %c) { +; CHECK-LABEL: fcmp_ule_v16f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z0.h +; CHECK-NEXT: fcmgt p0.h, p0/z, z2.h, z3.h +; CHECK-NEXT: mov z0.h, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.h, p1/z, #-1 // 
=0xffffffffffffffff +; CHECK-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: eor z1.d, z1.d, z0.d +; CHECK-NEXT: eor z0.d, z2.d, z0.d +; CHECK-NEXT: stp q1, q0, [x2] +; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_ule_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; 
NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 -; NONEON-NOSVE-NEXT: csetm w8, ls -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: csetm w8, ls -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 -; NONEON-NOSVE-NEXT: csetm w8, ls -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] -; NONEON-NOSVE-NEXT: ret - %op1 = load <16 x half>, ptr %a - %op2 = load <16 x half>, ptr %b - %cmp = fcmp ole <16 x half> %op1, %op2 - %sext = sext <16 x i1> %cmp to <16 x i16> - store <16 x i16> %sext, ptr %c - ret void -} - -; -; FCMP ULE -; - -define void @fcmp_ule_v16f16(ptr %a, ptr %b, ptr %c) { -; CHECK-LABEL: fcmp_ule_v16f16: -; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] -; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] -; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z0.h -; CHECK-NEXT: fcmgt p0.h, p0/z, z2.h, z3.h -; CHECK-NEXT: mov z0.h, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: eor z1.d, z1.d, z0.d -; 
CHECK-NEXT: eor z0.d, z2.d, z0.d -; CHECK-NEXT: stp q1, q0, [x2] -; CHECK-NEXT: ret -; -; NONEON-NOSVE-LABEL: fcmp_ule_v16f16: -; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, le -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, le -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, le -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, le -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, le -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, le -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: 
fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, le -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, le -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, le -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, le -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, le -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, le -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, 
v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, le -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, le -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, le -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 ; NONEON-NOSVE-NEXT: csetm w8, le -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -2025,119 +2263,127 @@ define void @fcmp_uno_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_uno_v16f16: ; 
NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vs +; 
NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, vs -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, vs -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, vs -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, vs -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; 
NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, vs -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, vs -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, vs -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, vs -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, vs -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, vs -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, vs -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: 
strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, vs -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, vs -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, vs -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, vs -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 ; 
NONEON-NOSVE-NEXT: csetm w8, vs -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -2169,119 +2415,127 @@ define void @fcmp_ord_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_ord_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, 
#92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, vc -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, vc -; 
NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, vc -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, vc -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, vc -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, vc -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, vc -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, vc -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, vc -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, vc -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, 
#78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, vc -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, vc -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, vc -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: 
csetm w8, vc -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, vc -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 ; NONEON-NOSVE-NEXT: csetm w8, vc -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -2310,119 +2564,127 @@ define void @fcmp_eq_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_eq_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, eq -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt 
s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, eq -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, eq -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, eq -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, eq -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, eq -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, eq -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, eq -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, eq -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt 
s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; 
NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -2451,119 +2713,127 @@ define void @fcmp_ne_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_ne_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; 
NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; 
NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, ne -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, ne -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, ne -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, ne -; 
NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, ne -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, ne -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, ne -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, ne -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, ne -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; 
NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 ; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: 
stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -2592,119 +2862,127 @@ define void @fcmp_gt_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_gt_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt 
+; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, gt -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, gt -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, gt -; NONEON-NOSVE-NEXT: 
fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, gt -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, gt -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, gt -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, gt -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, gt -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, gt -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, gt -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, gt -; 
NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, gt -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, gt -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, gt -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr 
h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, gt -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 ; NONEON-NOSVE-NEXT: csetm w8, gt -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -2733,119 +3011,127 @@ define void @fcmp_lt_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_lt_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, 
#94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; 
NONEON-NOSVE-NEXT: csetm w8, lt -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, lt -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, lt -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, lt -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, lt -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, lt -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, lt -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, lt -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, lt -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, lt -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; 
NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, lt -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, lt -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, lt -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh 
w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, lt -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, lt -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 ; NONEON-NOSVE-NEXT: csetm w8, lt -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -2874,119 +3160,127 @@ define void @fcmp_ge_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_ge_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; 
NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; 
NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ge -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, ge -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, ge -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, ge -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, ge -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, ge -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, ge -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, 
ge -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, ge -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, ge -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ge -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ge -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; 
NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ge -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ge -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ge -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 ; NONEON-NOSVE-NEXT: csetm w8, ge -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -3015,119 +3309,127 @@ define void @fcmp_le_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_le_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 
-; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, 
#86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, le -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, le -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, le -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, le -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, le -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; 
NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, le -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, le -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, le -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, le -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, le -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, le -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, le -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; 
NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, le -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, le -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, le -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 ; NONEON-NOSVE-NEXT: csetm w8, le -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; 
NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll index 055af194be211a..2c08977320e848 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll @@ -21,13 +21,28 @@ define void @fp_convert_combine_crash(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fp_convert_combine_crash: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov v0.4s, #8.00000000 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fmul v1.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fmul v0.4s, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s -; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0, #3 +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #16] +; NONEON-NOSVE-NEXT: fcvtzs w9, s1, #3 +; NONEON-NOSVE-NEXT: fcvtzs w10, s2, #3 +; NONEON-NOSVE-NEXT: fcvtzs w11, s0, #3 +; NONEON-NOSVE-NEXT: ldp s2, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s0, s3, [sp] +; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #56] +; NONEON-NOSVE-NEXT: fcvtzs w12, s1, #3 +; NONEON-NOSVE-NEXT: fcvtzs w8, s2, #3 +; NONEON-NOSVE-NEXT: stp w11, w10, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzs w9, s3, #3 +; NONEON-NOSVE-NEXT: fcvtzs w10, s0, #3 +; NONEON-NOSVE-NEXT: stp w8, w12, [sp, #40] +; NONEON-NOSVE-NEXT: stp w10, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %f = load <8 x float>, ptr %a %mul.i = fmul <8 x float> %f, %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v2f16_to_v2f32: ; NONEON-NOSVE: // %bb.0: -; 
NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] ; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fpext <2 x half> %a to <2 x float> store <2 x float> %res, ptr %b @@ -41,8 +49,22 @@ define void @fcvt_v4f16_to_v4f32(<4 x half> %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v4f16_to_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fpext <4 x half> %a to <4 x float> store <4 x float> %res, ptr %b @@ -64,13 +86,33 @@ define void @fcvt_v8f16_to_v8f32(<8 x half> %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v8f16_to_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %res = fpext <8 x half> %a to <8 x float> store <8 x float> %res, ptr %b @@ -99,17 +141,57 @@ define void @fcvt_v16f16_to_v16f32(<16 x half> %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v16f16_to_v16f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h -; NONEON-NOSVE-NEXT: stp q0, q3, [x0] -; NONEON-NOSVE-NEXT: stp q1, q2, [x0, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-128]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #80] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #64] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #120] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #112] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #104] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %res = 
fpext <16 x half> %a to <16 x float> store <16 x float> %res, ptr %b @@ -132,9 +214,20 @@ define void @fcvt_v2f16_v2f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v2f16_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr s0, [x0] -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr w8, [x0] +; NONEON-NOSVE-NEXT: str w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: str d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] ; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x half>, ptr %a %res = fpext <2 x half> %op1 to <2 x float> @@ -153,9 +246,23 @@ define void @fcvt_v4f16_v4f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v4f16_v4f32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x half>, ptr %a %res = fpext <4 x half> %op1 to <4 x float> @@ -178,13 +285,33 @@ define void @fcvt_v8f16_v8f32(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvt_v8f16_v8f32: ; NONEON-NOSVE: // %bb.0: ; 
NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: stp q0, q1, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fpext <8 x half> %op1 to <8 x float> @@ -214,17 +341,57 @@ define void @fcvt_v16f16_v16f32(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvt_v16f16_v16f32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-128]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #80] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #64] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #120] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #112] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; 
NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #104] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fpext <16 x half> %op1 to <16 x float> @@ -246,9 +413,14 @@ define void @fcvt_v1f16_v1f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v1f16_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: ldr h0, [x0] ; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] ; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %op1 = load <1 x half>, ptr %a %res = fpext <1 x half> %op1 to <1 x double> @@ -267,10 +439,26 @@ define void @fcvt_v2f16_v2f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v2f16_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr s0, [x0] -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: ldr w8, [x0] +; NONEON-NOSVE-NEXT: str w8, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: str d0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x half>, ptr %a %res = fpext <2 x half> %op1 to <2 x double> @@ -292,15 +480,35 @@ define void @fcvt_v4f16_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v4f16_v4f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 ; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s -; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s -; NONEON-NOSVE-NEXT: stp q0, q1, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x half>, ptr %a %res = fpext <4 x half> %op1 to <4 x double> @@ -329,22 +537,61 @@ define void @fcvt_v8f16_v8f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v8f16_v8f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #160 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s -; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s -; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: fcvtl v2.2d, v2.2s -; NONEON-NOSVE-NEXT: fcvtl v3.2d, v3.2s -; NONEON-NOSVE-NEXT: stp q0, q2, [x1] -; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #92] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #88] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #144] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #84] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #80] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: 
stp d0, d1, [sp, #128] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #76] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #128] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #72] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #68] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #64] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fpext <8 x half> %op1 to <8 x double> @@ -390,34 +637,115 @@ define void @fcvt_v16f16_v16f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v16f16_v16f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #336 +; NONEON-NOSVE-NEXT: str x29, [sp, #320] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 336 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 ; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h -; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s -; NONEON-NOSVE-NEXT: fcvtl v2.2d, v2.2s -; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #64] -; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] -; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s -; NONEON-NOSVE-NEXT: ldr d4, [sp, #88] -; NONEON-NOSVE-NEXT: ldr d6, [sp, #72] -; NONEON-NOSVE-NEXT: ldr d7, [sp, #40] -; NONEON-NOSVE-NEXT: fcvtl v5.2d, v5.2s -; NONEON-NOSVE-NEXT: fcvtl v3.2d, v3.2s -; NONEON-NOSVE-NEXT: fcvtl v4.2d, v4.2s -; NONEON-NOSVE-NEXT: stp q0, q5, [x1] -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v7.2s -; NONEON-NOSVE-NEXT: stp q1, q4, [x1, #64] -; NONEON-NOSVE-NEXT: fcvtl v1.2d, v6.2s -; NONEON-NOSVE-NEXT: stp q2, q0, [x1, #32] -; NONEON-NOSVE-NEXT: stp q3, q1, [x1, #96] -; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: ldr x29, [sp, #320] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: stp q0, q1, [sp] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #40] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #96] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #80] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s1, h0 
+; NONEON-NOSVE-NEXT: ldr h0, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #66] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #64] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #128] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #70] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #68] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #120] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #112] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #104] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #88] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #152] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #72] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #136] +; NONEON-NOSVE-NEXT: ldp d2, d1, [sp, #120] +; NONEON-NOSVE-NEXT: str d1, [sp, #328] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #104] +; NONEON-NOSVE-NEXT: str d0, [sp, #168] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #164] +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #176] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #160] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #240] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #156] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #152] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #224] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #148] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #144] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #208] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #140] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #136] +; 
NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #192] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #332] +; NONEON-NOSVE-NEXT: ldp q4, q3, [sp, #192] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #328] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #304] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #188] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #184] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #288] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #180] +; NONEON-NOSVE-NEXT: ldp q7, q6, [sp, #288] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #176] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #272] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #172] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #168] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #256] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #224] +; NONEON-NOSVE-NEXT: ldp q2, q5, [sp, #256] +; NONEON-NOSVE-NEXT: stp q3, q4, [x1, #32] +; NONEON-NOSVE-NEXT: stp q6, q7, [x1, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: stp q5, q2, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #336 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fpext <16 x half> %op1 to <16 x double> @@ -440,7 +768,7 @@ define void @fcvt_v1f32_v1f64(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvt_v1f32_v1f64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr s0, [x0] -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: fcvt d0, s0 ; NONEON-NOSVE-NEXT: str d0, [x1] ; NONEON-NOSVE-NEXT: ret %op1 = load <1 x float>, ptr %a @@ -460,9 +788,18 @@ define void @fcvt_v2f32_v2f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v2f32_v2f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; 
NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x float>, ptr %a %res = fpext <2 x float> %op1 to <2 x double> @@ -485,13 +822,23 @@ define void @fcvt_v4f32_v4f64(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvt_v4f32_v4f64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s -; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s -; NONEON-NOSVE-NEXT: stp q0, q1, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x float>, ptr %a %res = fpext <4 x float> %op1 to <4 x double> @@ -521,17 +868,37 @@ define void @fcvt_v8f32_v8f64(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvt_v8f32_v8f64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s -; NONEON-NOSVE-NEXT: fcvtl v2.2d, v2.2s -; NONEON-NOSVE-NEXT: fcvtl v3.2d, v3.2s -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-128]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #60] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #64] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fpext <8 x float> %op1 to <8 x double> @@ -554,9 +921,21 @@ define void @fcvt_v2f32_v2f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v2f32_v2f16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; NONEON-NOSVE-NEXT: ldr 
d0, [x0] -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s -; NONEON-NOSVE-NEXT: str s0, [x1] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: str d0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: str w8, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x float>, ptr %a %res = fptrunc <2 x float> %op1 to <2 x half> @@ -576,8 +955,23 @@ define void @fcvt_v4f32_v4f16(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvt_v4f32_v4f16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] ; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x float>, ptr %a %res = fptrunc <4 x float> %op1 to <4 x half> @@ -599,10 +993,36 @@ define void @fcvt_v8f32_v8f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v8f32_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fptrunc <8 x float> %op1 to <8 x half> @@ -647,11 +1067,19 @@ define void @fcvt_v2f64_v2f16(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvt_v2f64_v2f16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: mov d1, v0.d[1] +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] ; NONEON-NOSVE-NEXT: fcvt h0, d0 -; NONEON-NOSVE-NEXT: fcvt h1, d1 -; NONEON-NOSVE-NEXT: mov v0.h[1], v1.h[0] -; NONEON-NOSVE-NEXT: str s0, [x1] +; NONEON-NOSVE-NEXT: str h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: fcvt h0, d0 +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: str d0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: str w8, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x double>, ptr %a %res = fptrunc <2 x double> %op1 to <2 x half> @@ -673,17 +1101,24 @@ define void @fcvt_v4f64_v4f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v4f64_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] -; NONEON-NOSVE-NEXT: mov d1, v0.d[1] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt h0, d0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt h0, d0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] ; NONEON-NOSVE-NEXT: fcvt h0, d0 -; NONEON-NOSVE-NEXT: fcvt h1, d1 -; NONEON-NOSVE-NEXT: mov v0.h[1], v1.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, d2 -; NONEON-NOSVE-NEXT: mov d2, v2.d[1] -; NONEON-NOSVE-NEXT: mov v0.h[2], v1.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, d2 -; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: fcvt h0, d0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] ; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptrunc <4 x double> %op1 to <4 x half> @@ -706,8 +1141,7 @@ define void 
@fcvt_v1f64_v1f32(<1 x double> %op1, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v1f64_v1f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: fcvt s0, d0 ; NONEON-NOSVE-NEXT: str s0, [x0] ; NONEON-NOSVE-NEXT: ret %res = fptrunc <1 x double> %op1 to <1 x float> @@ -726,8 +1160,16 @@ define void @fcvt_v2f64_v2f32(<2 x double> %op1, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v2f64_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, d0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] ; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptrunc <2 x double> %op1 to <2 x float> store <2 x float> %res, ptr %b @@ -748,10 +1190,22 @@ define void @fcvt_v4f64_v4f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v4f64_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d -; NONEON-NOSVE-NEXT: fcvtn2 v0.4s, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, d0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, d0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptrunc <4 x double> %op1 to <4 x float> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll index 9d2b55903f3141..680cb4fb0a7910 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll @@ -21,14 +21,59 @@ define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) ; ; NONEON-NOSVE-LABEL: fma_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fmul v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: str d2, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h6, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h7, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: ldr h4, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: ldr h5, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: ldr h3, [sp] +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fmul s1, s3, s1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: fmul s2, s7, s6 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s2, s0 +; NONEON-NOSVE-NEXT: fmul s2, s5, s4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s2, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %mul = fmul contract <4 x half> %op1, %op2 %res = fadd 
contract <4 x half> %mul, %op3 @@ -48,22 +93,107 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) ; ; NONEON-NOSVE-LABEL: fma_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v4.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fmul v3.4s, v4.4s, v3.4s -; NONEON-NOSVE-NEXT: fmul v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v1.4h, v3.4s -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h -; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v3.4s -; NONEON-NOSVE-NEXT: fadd v1.4s, v1.4s, v2.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: str q2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h22, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h23, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s3, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: ldr h20, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s22, h22 +; NONEON-NOSVE-NEXT: fcvt s23, h23 +; NONEON-NOSVE-NEXT: ldr h21, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s20, h20 +; NONEON-NOSVE-NEXT: ldr h18, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h19, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h16, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h17, [sp, #6] +; NONEON-NOSVE-NEXT: fmul s5, s1, s3 +; NONEON-NOSVE-NEXT: fcvt s21, h21 +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: fcvt s19, h19 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: ldr h6, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h7, [sp, #4] +; NONEON-NOSVE-NEXT: ldr h3, [sp, 
#18] +; NONEON-NOSVE-NEXT: ldr h4, [sp, #2] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt h5, s5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fmul s3, s4, s3 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: fadd s2, s5, s2 +; NONEON-NOSVE-NEXT: fmul s5, s23, s22 +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h5, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: str h2, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s2, s5, s2 +; NONEON-NOSVE-NEXT: fmul s5, s21, s20 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h5, s5 +; NONEON-NOSVE-NEXT: str h2, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s2, s5, s2 +; NONEON-NOSVE-NEXT: fmul s5, s19, s18 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h5, s5 +; NONEON-NOSVE-NEXT: str h2, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s2, s5, s2 +; NONEON-NOSVE-NEXT: fmul s5, s17, s16 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h5, s5 +; NONEON-NOSVE-NEXT: str h2, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s2, s5, s2 +; NONEON-NOSVE-NEXT: fmul s5, s7, s6 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h5, s5 +; NONEON-NOSVE-NEXT: str h2, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h2, [sp, 
#36] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s2, s5, s2 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: str h2, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s2, s3, s2 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: str h1, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %mul = fmul contract <8 x half> %op1, %op2 %res = fadd contract <8 x half> %mul, %op3 @@ -85,42 +215,228 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fma_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] -; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] -; NONEON-NOSVE-NEXT: fcvtl v5.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v7.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fcvtl v4.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v6.4s, v3.4h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: fcvtl2 v3.4s, v3.8h -; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h -; NONEON-NOSVE-NEXT: fmul v4.4s, v5.4s, v4.4s -; NONEON-NOSVE-NEXT: fmul v5.4s, v7.4s, v6.4s -; NONEON-NOSVE-NEXT: fmul v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fmul v2.4s, v2.4s, v3.4s -; NONEON-NOSVE-NEXT: fcvtn v1.4h, v4.4s -; NONEON-NOSVE-NEXT: fcvtn v3.4h, v5.4s -; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v2.4s -; NONEON-NOSVE-NEXT: ldp q0, q2, [x2] -; NONEON-NOSVE-NEXT: fcvtl v4.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v5.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v6.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtl v7.4s, v3.4h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h -; NONEON-NOSVE-NEXT: 
fcvtl2 v3.4s, v3.8h -; NONEON-NOSVE-NEXT: fadd v4.4s, v5.4s, v4.4s -; NONEON-NOSVE-NEXT: fadd v5.4s, v7.4s, v6.4s -; NONEON-NOSVE-NEXT: fadd v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fadd v2.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: fcvtn v1.4h, v4.4s -; NONEON-NOSVE-NEXT: fcvtn v3.4h, v5.4s -; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v2.4s -; NONEON-NOSVE-NEXT: stp q1, q3, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #208 +; NONEON-NOSVE-NEXT: stp d15, d14, [sp, #144] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp d13, d12, [sp, #160] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp d11, d10, [sp, #176] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp d9, d8, [sp, #192] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 208 +; NONEON-NOSVE-NEXT: .cfi_offset b8, -8 +; NONEON-NOSVE-NEXT: .cfi_offset b9, -16 +; NONEON-NOSVE-NEXT: .cfi_offset b10, -24 +; NONEON-NOSVE-NEXT: .cfi_offset b11, -32 +; NONEON-NOSVE-NEXT: .cfi_offset b12, -40 +; NONEON-NOSVE-NEXT: .cfi_offset b13, -48 +; NONEON-NOSVE-NEXT: .cfi_offset b14, -56 +; NONEON-NOSVE-NEXT: .cfi_offset b15, -64 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q18, q19, [x2] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #16] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr h24, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h25, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #94] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #78] +; NONEON-NOSVE-NEXT: str q19, [sp, #96] +; NONEON-NOSVE-NEXT: str q18, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h18, [sp, #110] +; NONEON-NOSVE-NEXT: ldr h15, [sp, #92] +; NONEON-NOSVE-NEXT: fcvt s20, h0 +; NONEON-NOSVE-NEXT: fcvt s21, h1 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: ldr h13, [sp, #90] +; NONEON-NOSVE-NEXT: ldr h14, [sp, #74] +; NONEON-NOSVE-NEXT: str h0, [sp, #14] // 2-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr h0, [sp, #76] +; 
NONEON-NOSVE-NEXT: ldr h11, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h12, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h9, [sp, #86] +; NONEON-NOSVE-NEXT: ldr h10, [sp, #70] +; NONEON-NOSVE-NEXT: fmul s30, s21, s20 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: ldr h31, [sp, #84] +; NONEON-NOSVE-NEXT: ldr h8, [sp, #68] +; NONEON-NOSVE-NEXT: ldr h28, [sp, #82] +; NONEON-NOSVE-NEXT: ldr h29, [sp, #66] +; NONEON-NOSVE-NEXT: ldr h26, [sp, #80] +; NONEON-NOSVE-NEXT: ldr h27, [sp, #64] +; NONEON-NOSVE-NEXT: ldr h22, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h23, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h20, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h21, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt h19, s30 +; NONEON-NOSVE-NEXT: fcvt s30, h15 +; NONEON-NOSVE-NEXT: ldr h16, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h17, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h6, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h7, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: ldr h4, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h5, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt s19, h19 +; NONEON-NOSVE-NEXT: fmul s0, s0, s30 +; NONEON-NOSVE-NEXT: fcvt s30, h14 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h3, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #16] +; NONEON-NOSVE-NEXT: fmul s16, s17, s16 +; NONEON-NOSVE-NEXT: fmul s6, s7, s6 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s18, s19, s18 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s19, h13 +; NONEON-NOSVE-NEXT: fmul s4, s5, s4 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: ldp d15, d14, [sp, #144] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: fcvt h16, s16 +; NONEON-NOSVE-NEXT: fcvt h6, s6 +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: fmul 
s2, s3, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #14] // 2-byte Folded Reload +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: str h18, [sp, #142] +; NONEON-NOSVE-NEXT: ldr h18, [sp, #108] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: fmul s1, s1, s3 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s0, s0, s18 +; NONEON-NOSVE-NEXT: fmul s18, s30, s19 +; NONEON-NOSVE-NEXT: fcvt s19, h11 +; NONEON-NOSVE-NEXT: fcvt s30, h12 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: ldp d13, d12, [sp, #160] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: str h0, [sp, #140] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #106] +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s18, s0 +; NONEON-NOSVE-NEXT: fmul s18, s30, s19 +; NONEON-NOSVE-NEXT: fcvt s19, h9 +; NONEON-NOSVE-NEXT: fcvt s30, h10 +; NONEON-NOSVE-NEXT: ldp d11, d10, [sp, #176] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: str h0, [sp, #138] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #104] +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s18, s0 +; NONEON-NOSVE-NEXT: fmul s18, s30, s19 +; NONEON-NOSVE-NEXT: fcvt s19, h31 +; NONEON-NOSVE-NEXT: fcvt s30, h8 +; NONEON-NOSVE-NEXT: ldp d9, d8, [sp, #192] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: str h0, [sp, #136] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #102] +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s18, s0 +; NONEON-NOSVE-NEXT: fmul s18, s30, s19 +; NONEON-NOSVE-NEXT: fcvt s19, h28 +; NONEON-NOSVE-NEXT: fcvt s28, h29 +; 
NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: str h0, [sp, #134] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #100] +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s18, s0 +; NONEON-NOSVE-NEXT: fmul s18, s28, s19 +; NONEON-NOSVE-NEXT: fcvt s19, h26 +; NONEON-NOSVE-NEXT: fcvt s26, h27 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: str h0, [sp, #132] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #98] +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s18, s0 +; NONEON-NOSVE-NEXT: fmul s18, s26, s19 +; NONEON-NOSVE-NEXT: fcvt s19, h24 +; NONEON-NOSVE-NEXT: fcvt s24, h25 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: str h0, [sp, #130] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #96] +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s18, s0 +; NONEON-NOSVE-NEXT: fmul s18, s24, s19 +; NONEON-NOSVE-NEXT: fcvt s19, h22 +; NONEON-NOSVE-NEXT: fcvt s22, h23 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: str h0, [sp, #128] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s18, s0 +; NONEON-NOSVE-NEXT: fmul s18, s22, s19 +; NONEON-NOSVE-NEXT: fcvt s19, h20 +; NONEON-NOSVE-NEXT: fcvt s20, h21 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: str h0, [sp, #126] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s18, s0 +; NONEON-NOSVE-NEXT: fmul s18, s20, s19 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: str h0, [sp, #124] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: 
fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s18, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #122] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s16, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #120] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s6, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #118] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s4, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #116] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s2, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #114] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #112] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #112] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #208 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -144,8 +460,19 @@ define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %o ; ; NONEON-NOSVE-LABEL: fma_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmla v2.2s, v1.2s, v0.2s -; NONEON-NOSVE-NEXT: fmov d0, d2 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #8] +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fmadd s5, s4, s3, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s5, [sp, #24] +; NONEON-NOSVE-NEXT: ldr 
d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %mul = fmul contract <2 x float> %op1, %op2 %res = fadd contract <2 x float> %mul, %op3 @@ -165,8 +492,26 @@ define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %o ; ; NONEON-NOSVE-LABEL: fma_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmla v2.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: stp q1, q2, [sp, #16] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #24] +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #44] +; NONEON-NOSVE-NEXT: fmadd s5, s4, s3, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #40] +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp] +; NONEON-NOSVE-NEXT: stp s0, s5, [sp, #56] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #36] +; NONEON-NOSVE-NEXT: fmadd s5, s4, s3, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #32] +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s5, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %mul = fmul contract <4 x float> %op1, %op2 %res = fadd contract <4 x float> %mul, %op3 @@ -188,12 +533,45 @@ define void @fma_v8f32(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fma_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q4, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q5, [x2] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: fmla v1.4s, v0.4s, v2.4s -; NONEON-NOSVE-NEXT: fmla v5.4s, v4.4s, v3.4s -; NONEON-NOSVE-NEXT: stp q1, q5, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #128 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x2] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q4, q5, [x0] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #64] +; 
NONEON-NOSVE-NEXT: stp q4, q2, [sp] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #92] +; NONEON-NOSVE-NEXT: stp q1, q5, [sp, #32] +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #72] +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp, #56] +; NONEON-NOSVE-NEXT: fmadd s5, s4, s3, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #88] +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp, #48] +; NONEON-NOSVE-NEXT: stp s0, s5, [sp, #120] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #84] +; NONEON-NOSVE-NEXT: fmadd s5, s4, s3, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #80] +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #24] +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s5, [sp, #112] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #44] +; NONEON-NOSVE-NEXT: fmadd s5, s4, s3, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #40] +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp] +; NONEON-NOSVE-NEXT: stp s0, s5, [sp, #104] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #36] +; NONEON-NOSVE-NEXT: fmadd s5, s4, s3, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #32] +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s5, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -212,7 +590,12 @@ define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double ; ; NONEON-NOSVE-LABEL: fma_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fmadd d0, d0, d1, d2 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %mul = fmul contract <1 x double> %op1, %op2 %res = fadd contract <1 x double> %mul, %op3 
@@ -232,8 +615,19 @@ define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double ; ; NONEON-NOSVE-LABEL: fma_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmla v2.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: stp q1, q2, [sp, #16] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldp d1, d3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d2, d4, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: fmadd d5, d4, d3, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #32] +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d5, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %mul = fmul contract <2 x double> %op1, %op2 %res = fadd contract <2 x double> %mul, %op3 @@ -255,12 +649,31 @@ define void @fma_v4f64(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fma_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q4, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q5, [x2] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: fmla v1.2d, v0.2d, v2.2d -; NONEON-NOSVE-NEXT: fmla v5.2d, v4.2d, v3.2d -; NONEON-NOSVE-NEXT: stp q1, q5, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #128 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x2] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q4, q5, [x0] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q4, q2, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #88] +; NONEON-NOSVE-NEXT: stp q1, q5, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp d2, d4, [sp, #48] +; NONEON-NOSVE-NEXT: fmadd d5, d4, d3, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #80] +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 +; NONEON-NOSVE-NEXT: ldp d1, d3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d2, d4, [sp] +; NONEON-NOSVE-NEXT: stp d0, d5, [sp, #112] +; 
NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: fmadd d5, d4, d3, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #32] +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d5, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll index a96adfec2ad105..775cac272cde9d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll @@ -21,34 +21,39 @@ define <4 x half> @fmaxnm_v4f16(<4 x half> %op1, <4 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fmaxnm_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] -; NONEON-NOSVE-NEXT: mov h4, v1.h[2] -; NONEON-NOSVE-NEXT: mov h5, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h1 -; NONEON-NOSVE-NEXT: fcvt s7, h0 -; NONEON-NOSVE-NEXT: mov h1, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmaxnm s2, s3, s2 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fmaxnm s5, s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: fmaxnm s3, s4, s3 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt h0, s5 -; NONEON-NOSVE-NEXT: fcvt s4, h6 -; NONEON-NOSVE-NEXT: mov v0.h[1], v2.h[0] -; NONEON-NOSVE-NEXT: fcvt h2, s3 -; NONEON-NOSVE-NEXT: fmaxnm s1, s4, s1 -; NONEON-NOSVE-NEXT: mov v0.h[2], v2.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] -; NONEON-NOSVE-NEXT: // kill: 
def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %op1, <4 x half> %op2) ret <4 x half> %res @@ -66,60 +71,66 @@ define <8 x half> @fmaxnm_v8f16(<8 x half> %op1, <8 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fmaxnm_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h1 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: mov h6, v1.h[2] -; NONEON-NOSVE-NEXT: mov h7, v0.h[2] -; NONEON-NOSVE-NEXT: mov h16, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fmaxnm s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; 
NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: fmaxnm s3, s3, s2 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s4 -; NONEON-NOSVE-NEXT: fmaxnm s4, s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[4] -; NONEON-NOSVE-NEXT: mov h7, v0.h[4] -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fmaxnm s5, s5, s16 -; NONEON-NOSVE-NEXT: mov h16, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: mov v2.h[1], v3.h[0] -; NONEON-NOSVE-NEXT: fcvt s3, h6 -; NONEON-NOSVE-NEXT: fcvt s6, h7 -; NONEON-NOSVE-NEXT: mov h7, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt h5, s5 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: mov v2.h[2], v4.h[0] -; NONEON-NOSVE-NEXT: mov h4, v1.h[6] -; NONEON-NOSVE-NEXT: fmaxnm s3, s6, s3 -; NONEON-NOSVE-NEXT: mov h6, v0.h[6] -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v2.h[3], v5.h[0] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fcvt s5, h6 -; NONEON-NOSVE-NEXT: fmaxnm s6, s16, s7 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v2.h[4], v3.h[0] -; NONEON-NOSVE-NEXT: fmaxnm s4, s5, s4 -; NONEON-NOSVE-NEXT: fcvt h3, s6 -; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[5], v3.h[0] -; NONEON-NOSVE-NEXT: fcvt h3, s4 -; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: mov v2.h[6], v3.h[0] -; NONEON-NOSVE-NEXT: mov v2.h[7], v0.h[0] -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; 
NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.maxnum.v8f16(<8 x half> %op1, <8 x half> %op2) ret <8 x half> %res @@ -139,115 +150,127 @@ define void @fmaxnm_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fmaxnm_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] -; NONEON-NOSVE-NEXT: mov h7, v0.h[1] -; NONEON-NOSVE-NEXT: mov h16, v0.h[2] -; NONEON-NOSVE-NEXT: mov h18, v2.h[1] -; NONEON-NOSVE-NEXT: mov h5, v1.h[1] -; NONEON-NOSVE-NEXT: mov h6, v1.h[2] -; NONEON-NOSVE-NEXT: mov h17, v3.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h1 -; NONEON-NOSVE-NEXT: fcvt s19, h0 -; NONEON-NOSVE-NEXT: fcvt s20, h3 -; NONEON-NOSVE-NEXT: fcvt s21, h2 -; NONEON-NOSVE-NEXT: mov h22, v3.h[2] -; NONEON-NOSVE-NEXT: mov h23, v2.h[2] -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: fcvt s18, h18 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s17, h17 -; NONEON-NOSVE-NEXT: fmaxnm s4, s19, s4 -; NONEON-NOSVE-NEXT: mov h19, v0.h[3] -; NONEON-NOSVE-NEXT: mov h24, v3.h[3] -; NONEON-NOSVE-NEXT: fmaxnm s20, s21, s20 -; NONEON-NOSVE-NEXT: fcvt s21, h22 -; NONEON-NOSVE-NEXT: fcvt s22, h23 -; NONEON-NOSVE-NEXT: mov h23, v2.h[3] -; NONEON-NOSVE-NEXT: mov h25, v2.h[6] -; NONEON-NOSVE-NEXT: fmaxnm s5, s7, s5 -; NONEON-NOSVE-NEXT: mov h7, v1.h[3] -; NONEON-NOSVE-NEXT: fmaxnm s6, s16, s6 -; NONEON-NOSVE-NEXT: fmaxnm s16, s18, s17 -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: fcvt s18, h19 -; NONEON-NOSVE-NEXT: fcvt s19, h24 -; NONEON-NOSVE-NEXT: mov h24, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt h17, s5 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcvt h5, s20 -; NONEON-NOSVE-NEXT: fmaxnm s20, s22, s21 -; 
NONEON-NOSVE-NEXT: fcvt h16, s16 -; NONEON-NOSVE-NEXT: fcvt s21, h23 -; NONEON-NOSVE-NEXT: fcvt h6, s6 -; NONEON-NOSVE-NEXT: mov h22, v0.h[4] -; NONEON-NOSVE-NEXT: mov h23, v2.h[4] -; NONEON-NOSVE-NEXT: mov v4.h[1], v17.h[0] -; NONEON-NOSVE-NEXT: mov h17, v1.h[4] -; NONEON-NOSVE-NEXT: fmaxnm s7, s18, s7 -; NONEON-NOSVE-NEXT: mov h18, v3.h[4] -; NONEON-NOSVE-NEXT: mov v5.h[1], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt h16, s20 -; NONEON-NOSVE-NEXT: fmaxnm s19, s21, s19 -; NONEON-NOSVE-NEXT: fcvt s20, h23 -; NONEON-NOSVE-NEXT: mov h21, v1.h[5] -; NONEON-NOSVE-NEXT: mov h23, v2.h[5] -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: mov v4.h[2], v6.h[0] -; NONEON-NOSVE-NEXT: fcvt s6, h17 -; NONEON-NOSVE-NEXT: fcvt s17, h22 -; NONEON-NOSVE-NEXT: fcvt h7, s7 -; NONEON-NOSVE-NEXT: fcvt s18, h18 -; NONEON-NOSVE-NEXT: mov h22, v3.h[5] -; NONEON-NOSVE-NEXT: mov v5.h[2], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt h16, s19 -; NONEON-NOSVE-NEXT: mov h19, v0.h[6] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmaxnm s6, s17, s6 -; NONEON-NOSVE-NEXT: mov h17, v1.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: fmaxnm s18, s20, s18 -; NONEON-NOSVE-NEXT: mov h20, v3.h[6] -; NONEON-NOSVE-NEXT: mov v4.h[3], v7.h[0] -; NONEON-NOSVE-NEXT: fcvt s7, h22 -; NONEON-NOSVE-NEXT: fcvt s22, h23 -; NONEON-NOSVE-NEXT: mov v5.h[3], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt s16, h21 -; NONEON-NOSVE-NEXT: fcvt s21, h24 -; NONEON-NOSVE-NEXT: fcvt s19, h19 -; NONEON-NOSVE-NEXT: fcvt h6, s6 -; NONEON-NOSVE-NEXT: fcvt s17, h17 -; NONEON-NOSVE-NEXT: fcvt s23, h25 -; NONEON-NOSVE-NEXT: fcvt h18, s18 -; NONEON-NOSVE-NEXT: fcvt s20, h20 -; NONEON-NOSVE-NEXT: mov h3, v3.h[7] -; NONEON-NOSVE-NEXT: fmaxnm s7, s22, s7 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmaxnm s16, s21, s16 -; NONEON-NOSVE-NEXT: mov v4.h[4], v6.h[0] -; NONEON-NOSVE-NEXT: fmaxnm s6, s19, s17 -; NONEON-NOSVE-NEXT: mov v5.h[4], v18.h[0] -; 
NONEON-NOSVE-NEXT: fmaxnm s17, s23, s20 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt h7, s7 -; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 -; NONEON-NOSVE-NEXT: fcvt h16, s16 -; NONEON-NOSVE-NEXT: fcvt h6, s6 -; NONEON-NOSVE-NEXT: fmaxnm s2, s2, s3 -; NONEON-NOSVE-NEXT: fcvt h3, s17 -; NONEON-NOSVE-NEXT: mov v5.h[5], v7.h[0] -; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: mov v4.h[5], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, s2 -; NONEON-NOSVE-NEXT: mov v5.h[6], v3.h[0] -; NONEON-NOSVE-NEXT: mov v4.h[6], v6.h[0] -; NONEON-NOSVE-NEXT: mov v5.h[7], v1.h[0] -; NONEON-NOSVE-NEXT: mov v4.h[7], v0.h[0] -; NONEON-NOSVE-NEXT: stp q5, q4, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #94] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #92] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #90] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; 
NONEON-NOSVE-NEXT: str h0, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #86] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #84] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #82] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #80] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #78] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #76] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #74] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; 
NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #70] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #68] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #66] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -268,7 +291,17 @@ define <2 x float> @fmaxnm_v2f32(<2 x float> %op1, <2 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fmaxnm_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmaxnm v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fmaxnm s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: 
ret %res = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %op1, <2 x float> %op2) ret <2 x float> %res @@ -286,7 +319,22 @@ define <4 x float> @fmaxnm_v4f32(<4 x float> %op1, <4 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fmaxnm_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmaxnm v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fmaxnm s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fmaxnm s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %op1, <4 x float> %op2) ret <4 x float> %res @@ -306,11 +354,39 @@ define void @fmaxnm_v8f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fmaxnm_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fmaxnm v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fmaxnm v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #60] +; NONEON-NOSVE-NEXT: fmaxnm s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #56] +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #32] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #88] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #52] +; NONEON-NOSVE-NEXT: 
fmaxnm s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #48] +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fmaxnm s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #72] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fmaxnm s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -327,7 +403,12 @@ define <1 x double> @fmaxnm_v1f64(<1 x double> %op1, <1 x double> %op2) { ; ; NONEON-NOSVE-LABEL: fmaxnm_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fmaxnm d0, d0, d1 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.maxnum.v1f64(<1 x double> %op1, <1 x double> %op2) ret <1 x double> %res @@ -345,7 +426,16 @@ define <2 x double> @fmaxnm_v2f64(<2 x double> %op1, <2 x double> %op2) { ; ; NONEON-NOSVE-LABEL: fmaxnm_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmaxnm v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fmaxnm d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fmaxnm d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %op1, <2 x double> %op2) ret <2 x double> %res @@ -365,11 +455,27 @@ define void @fmaxnm_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fmaxnm_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fmaxnm v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: fmaxnm v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] +; NONEON-NOSVE-NEXT: fmaxnm d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #48] +; NONEON-NOSVE-NEXT: fmaxnm d0, d1, d0 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fmaxnm d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fmaxnm d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -394,34 +500,39 @@ define <4 x half> @fminnm_v4f16(<4 x half> %op1, <4 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fminnm_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 
killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] -; NONEON-NOSVE-NEXT: mov h4, v1.h[2] -; NONEON-NOSVE-NEXT: mov h5, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h1 -; NONEON-NOSVE-NEXT: fcvt s7, h0 -; NONEON-NOSVE-NEXT: mov h1, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fminnm s2, s3, s2 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fminnm s5, s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: fminnm s3, s4, s3 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt h0, s5 -; NONEON-NOSVE-NEXT: fcvt s4, h6 -; NONEON-NOSVE-NEXT: mov v0.h[1], v2.h[0] -; NONEON-NOSVE-NEXT: fcvt h2, s3 -; NONEON-NOSVE-NEXT: fminnm s1, s4, s1 -; NONEON-NOSVE-NEXT: mov v0.h[2], v2.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; 
NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.minnum.v4f16(<4 x half> %op1, <4 x half> %op2) ret <4 x half> %res @@ -439,60 +550,66 @@ define <8 x half> @fminnm_v8f16(<8 x half> %op1, <8 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fminnm_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h1 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: mov h6, v1.h[2] -; NONEON-NOSVE-NEXT: mov h7, v0.h[2] -; NONEON-NOSVE-NEXT: mov h16, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fminnm s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: fminnm s3, s3, s2 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s4 -; NONEON-NOSVE-NEXT: fminnm s4, s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[4] -; NONEON-NOSVE-NEXT: mov h7, v0.h[4] -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fminnm s5, s5, s16 -; NONEON-NOSVE-NEXT: mov h16, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: mov v2.h[1], v3.h[0] -; NONEON-NOSVE-NEXT: fcvt s3, h6 -; NONEON-NOSVE-NEXT: fcvt s6, h7 -; NONEON-NOSVE-NEXT: mov h7, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt h5, s5 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: mov v2.h[2], v4.h[0] -; NONEON-NOSVE-NEXT: mov h4, v1.h[6] -; NONEON-NOSVE-NEXT: fminnm s3, s6, s3 -; NONEON-NOSVE-NEXT: mov h6, v0.h[6] -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v2.h[3], v5.h[0] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; 
NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fcvt s5, h6 -; NONEON-NOSVE-NEXT: fminnm s6, s16, s7 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v2.h[4], v3.h[0] -; NONEON-NOSVE-NEXT: fminnm s4, s5, s4 -; NONEON-NOSVE-NEXT: fcvt h3, s6 -; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[5], v3.h[0] -; NONEON-NOSVE-NEXT: fcvt h3, s4 -; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: mov v2.h[6], v3.h[0] -; NONEON-NOSVE-NEXT: mov v2.h[7], v0.h[0] -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, 
s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.minnum.v8f16(<8 x half> %op1, <8 x half> %op2) ret <8 x half> %res @@ -512,115 +629,127 @@ define void @fminnm_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fminnm_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] -; NONEON-NOSVE-NEXT: mov h7, v0.h[1] -; NONEON-NOSVE-NEXT: mov h16, v0.h[2] -; NONEON-NOSVE-NEXT: mov h18, v2.h[1] -; NONEON-NOSVE-NEXT: mov h5, v1.h[1] -; NONEON-NOSVE-NEXT: mov h6, v1.h[2] -; NONEON-NOSVE-NEXT: mov h17, v3.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h1 -; NONEON-NOSVE-NEXT: fcvt s19, h0 -; NONEON-NOSVE-NEXT: fcvt s20, h3 -; NONEON-NOSVE-NEXT: fcvt s21, h2 -; NONEON-NOSVE-NEXT: mov h22, v3.h[2] -; NONEON-NOSVE-NEXT: mov h23, v2.h[2] -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: fcvt s18, h18 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s17, h17 -; NONEON-NOSVE-NEXT: fminnm s4, s19, s4 -; NONEON-NOSVE-NEXT: mov h19, v0.h[3] -; NONEON-NOSVE-NEXT: mov h24, v3.h[3] -; NONEON-NOSVE-NEXT: fminnm s20, s21, s20 -; 
NONEON-NOSVE-NEXT: fcvt s21, h22 -; NONEON-NOSVE-NEXT: fcvt s22, h23 -; NONEON-NOSVE-NEXT: mov h23, v2.h[3] -; NONEON-NOSVE-NEXT: mov h25, v2.h[6] -; NONEON-NOSVE-NEXT: fminnm s5, s7, s5 -; NONEON-NOSVE-NEXT: mov h7, v1.h[3] -; NONEON-NOSVE-NEXT: fminnm s6, s16, s6 -; NONEON-NOSVE-NEXT: fminnm s16, s18, s17 -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: fcvt s18, h19 -; NONEON-NOSVE-NEXT: fcvt s19, h24 -; NONEON-NOSVE-NEXT: mov h24, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt h17, s5 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcvt h5, s20 -; NONEON-NOSVE-NEXT: fminnm s20, s22, s21 -; NONEON-NOSVE-NEXT: fcvt h16, s16 -; NONEON-NOSVE-NEXT: fcvt s21, h23 -; NONEON-NOSVE-NEXT: fcvt h6, s6 -; NONEON-NOSVE-NEXT: mov h22, v0.h[4] -; NONEON-NOSVE-NEXT: mov h23, v2.h[4] -; NONEON-NOSVE-NEXT: mov v4.h[1], v17.h[0] -; NONEON-NOSVE-NEXT: mov h17, v1.h[4] -; NONEON-NOSVE-NEXT: fminnm s7, s18, s7 -; NONEON-NOSVE-NEXT: mov h18, v3.h[4] -; NONEON-NOSVE-NEXT: mov v5.h[1], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt h16, s20 -; NONEON-NOSVE-NEXT: fminnm s19, s21, s19 -; NONEON-NOSVE-NEXT: fcvt s20, h23 -; NONEON-NOSVE-NEXT: mov h21, v1.h[5] -; NONEON-NOSVE-NEXT: mov h23, v2.h[5] -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: mov v4.h[2], v6.h[0] -; NONEON-NOSVE-NEXT: fcvt s6, h17 -; NONEON-NOSVE-NEXT: fcvt s17, h22 -; NONEON-NOSVE-NEXT: fcvt h7, s7 -; NONEON-NOSVE-NEXT: fcvt s18, h18 -; NONEON-NOSVE-NEXT: mov h22, v3.h[5] -; NONEON-NOSVE-NEXT: mov v5.h[2], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt h16, s19 -; NONEON-NOSVE-NEXT: mov h19, v0.h[6] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fminnm s6, s17, s6 -; NONEON-NOSVE-NEXT: mov h17, v1.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: fminnm s18, s20, s18 -; NONEON-NOSVE-NEXT: mov h20, v3.h[6] -; NONEON-NOSVE-NEXT: mov v4.h[3], v7.h[0] -; NONEON-NOSVE-NEXT: fcvt s7, h22 -; NONEON-NOSVE-NEXT: fcvt s22, h23 -; NONEON-NOSVE-NEXT: mov v5.h[3], v16.h[0] -; 
NONEON-NOSVE-NEXT: fcvt s16, h21 -; NONEON-NOSVE-NEXT: fcvt s21, h24 -; NONEON-NOSVE-NEXT: fcvt s19, h19 -; NONEON-NOSVE-NEXT: fcvt h6, s6 -; NONEON-NOSVE-NEXT: fcvt s17, h17 -; NONEON-NOSVE-NEXT: fcvt s23, h25 -; NONEON-NOSVE-NEXT: fcvt h18, s18 -; NONEON-NOSVE-NEXT: fcvt s20, h20 -; NONEON-NOSVE-NEXT: mov h3, v3.h[7] -; NONEON-NOSVE-NEXT: fminnm s7, s22, s7 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fminnm s16, s21, s16 -; NONEON-NOSVE-NEXT: mov v4.h[4], v6.h[0] -; NONEON-NOSVE-NEXT: fminnm s6, s19, s17 -; NONEON-NOSVE-NEXT: mov v5.h[4], v18.h[0] -; NONEON-NOSVE-NEXT: fminnm s17, s23, s20 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt h7, s7 -; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 -; NONEON-NOSVE-NEXT: fcvt h16, s16 -; NONEON-NOSVE-NEXT: fcvt h6, s6 -; NONEON-NOSVE-NEXT: fminnm s2, s2, s3 -; NONEON-NOSVE-NEXT: fcvt h3, s17 -; NONEON-NOSVE-NEXT: mov v5.h[5], v7.h[0] -; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: mov v4.h[5], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, s2 -; NONEON-NOSVE-NEXT: mov v5.h[6], v3.h[0] -; NONEON-NOSVE-NEXT: mov v4.h[6], v6.h[0] -; NONEON-NOSVE-NEXT: mov v5.h[7], v1.h[0] -; NONEON-NOSVE-NEXT: mov v4.h[7], v0.h[0] -; NONEON-NOSVE-NEXT: stp q5, q4, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #94] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] 
+; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #92] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #90] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #86] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #84] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #82] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #80] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #78] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; 
NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #76] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #74] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #70] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #68] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #66] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -641,7 +770,17 @@ define <2 x float> @fminnm_v2f32(<2 x float> %op1, <2 x 
float> %op2) { ; ; NONEON-NOSVE-LABEL: fminnm_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fminnm v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fminnm s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.minnum.v2f32(<2 x float> %op1, <2 x float> %op2) ret <2 x float> %res @@ -659,7 +798,22 @@ define <4 x float> @fminnm_v4f32(<4 x float> %op1, <4 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fminnm_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fminnm v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fminnm s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fminnm s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.minnum.v4f32(<4 x float> %op1, <4 x float> %op2) ret <4 x float> %res @@ -679,11 +833,39 @@ define void @fminnm_v8f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fminnm_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fminnm v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fminnm v1.4s, v2.4s, v3.4s 
+; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #60] +; NONEON-NOSVE-NEXT: fminnm s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #56] +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #32] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #88] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #52] +; NONEON-NOSVE-NEXT: fminnm s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #48] +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fminnm s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #72] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fminnm s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -700,7 +882,12 @@ define <1 x double> @fminnm_v1f64(<1 x double> %op1, <1 x double> %op2) { ; ; NONEON-NOSVE-LABEL: fminnm_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fminnm d0, d0, d1 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.minnum.v1f64(<1 x double> %op1, <1 x double> %op2) ret <1 x double> %res @@ -718,7 +905,16 @@ define <2 x 
double> @fminnm_v2f64(<2 x double> %op1, <2 x double> %op2) { ; ; NONEON-NOSVE-LABEL: fminnm_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fminnm v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fminnm d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fminnm d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.minnum.v2f64(<2 x double> %op1, <2 x double> %op2) ret <2 x double> %res @@ -738,11 +934,27 @@ define void @fminnm_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fminnm_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fminnm v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: fminnm v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] +; NONEON-NOSVE-NEXT: fminnm d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #48] +; NONEON-NOSVE-NEXT: fminnm d0, d1, d0 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fminnm d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fminnm d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -767,34 +979,39 @@ define <4 
x half> @fmax_v4f16(<4 x half> %op1, <4 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fmax_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] -; NONEON-NOSVE-NEXT: mov h4, v1.h[2] -; NONEON-NOSVE-NEXT: mov h5, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h1 -; NONEON-NOSVE-NEXT: fcvt s7, h0 -; NONEON-NOSVE-NEXT: mov h1, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmax s2, s3, s2 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fmax s5, s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: fmax s3, s4, s3 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt h0, s5 -; NONEON-NOSVE-NEXT: fcvt s4, h6 -; NONEON-NOSVE-NEXT: mov v0.h[1], v2.h[0] -; NONEON-NOSVE-NEXT: fcvt h2, s3 -; NONEON-NOSVE-NEXT: fmax s1, s4, s1 -; NONEON-NOSVE-NEXT: mov v0.h[2], v2.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt 
s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.maximum.v4f16(<4 x half> %op1, <4 x half> %op2) ret <4 x half> %res @@ -812,60 +1029,66 @@ define <8 x half> @fmax_v8f16(<8 x half> %op1, <8 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fmax_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h1 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: mov h6, v1.h[2] -; NONEON-NOSVE-NEXT: mov h7, v0.h[2] -; NONEON-NOSVE-NEXT: mov h16, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fmax s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: fmax s3, s3, s2 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s4 -; NONEON-NOSVE-NEXT: fmax s4, s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[4] -; NONEON-NOSVE-NEXT: mov h7, v0.h[4] -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fmax s5, s5, s16 -; NONEON-NOSVE-NEXT: mov h16, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: mov v2.h[1], v3.h[0] -; NONEON-NOSVE-NEXT: fcvt s3, h6 -; NONEON-NOSVE-NEXT: fcvt s6, h7 -; NONEON-NOSVE-NEXT: mov h7, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt h5, s5 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: mov v2.h[2], v4.h[0] -; NONEON-NOSVE-NEXT: mov h4, v1.h[6] -; NONEON-NOSVE-NEXT: fmax s3, s6, s3 -; NONEON-NOSVE-NEXT: mov h6, v0.h[6] -; NONEON-NOSVE-NEXT: 
fcvt s7, h7 -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v2.h[3], v5.h[0] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fcvt s5, h6 -; NONEON-NOSVE-NEXT: fmax s6, s16, s7 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v2.h[4], v3.h[0] -; NONEON-NOSVE-NEXT: fmax s4, s5, s4 -; NONEON-NOSVE-NEXT: fcvt h3, s6 -; NONEON-NOSVE-NEXT: fmax s0, s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[5], v3.h[0] -; NONEON-NOSVE-NEXT: fcvt h3, s4 -; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: mov v2.h[6], v3.h[0] -; NONEON-NOSVE-NEXT: mov v2.h[7], v0.h[0] -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: 
fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.maximum.v8f16(<8 x half> %op1, <8 x half> %op2) ret <8 x half> %res @@ -885,115 +1108,127 @@ define void @fmax_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fmax_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] -; NONEON-NOSVE-NEXT: mov h7, v0.h[1] -; NONEON-NOSVE-NEXT: mov h16, v0.h[2] -; NONEON-NOSVE-NEXT: mov h18, v2.h[1] -; NONEON-NOSVE-NEXT: mov h5, v1.h[1] -; NONEON-NOSVE-NEXT: mov h6, v1.h[2] -; NONEON-NOSVE-NEXT: mov h17, v3.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h1 -; NONEON-NOSVE-NEXT: fcvt s19, h0 -; NONEON-NOSVE-NEXT: fcvt s20, h3 -; NONEON-NOSVE-NEXT: fcvt s21, h2 -; NONEON-NOSVE-NEXT: mov h22, v3.h[2] -; NONEON-NOSVE-NEXT: mov h23, v2.h[2] -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: fcvt s18, h18 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s17, h17 -; NONEON-NOSVE-NEXT: fmax s4, s19, 
s4 -; NONEON-NOSVE-NEXT: mov h19, v0.h[3] -; NONEON-NOSVE-NEXT: mov h24, v3.h[3] -; NONEON-NOSVE-NEXT: fmax s20, s21, s20 -; NONEON-NOSVE-NEXT: fcvt s21, h22 -; NONEON-NOSVE-NEXT: fcvt s22, h23 -; NONEON-NOSVE-NEXT: mov h23, v2.h[3] -; NONEON-NOSVE-NEXT: mov h25, v2.h[6] -; NONEON-NOSVE-NEXT: fmax s5, s7, s5 -; NONEON-NOSVE-NEXT: mov h7, v1.h[3] -; NONEON-NOSVE-NEXT: fmax s6, s16, s6 -; NONEON-NOSVE-NEXT: fmax s16, s18, s17 -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: fcvt s18, h19 -; NONEON-NOSVE-NEXT: fcvt s19, h24 -; NONEON-NOSVE-NEXT: mov h24, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt h17, s5 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcvt h5, s20 -; NONEON-NOSVE-NEXT: fmax s20, s22, s21 -; NONEON-NOSVE-NEXT: fcvt h16, s16 -; NONEON-NOSVE-NEXT: fcvt s21, h23 -; NONEON-NOSVE-NEXT: fcvt h6, s6 -; NONEON-NOSVE-NEXT: mov h22, v0.h[4] -; NONEON-NOSVE-NEXT: mov h23, v2.h[4] -; NONEON-NOSVE-NEXT: mov v4.h[1], v17.h[0] -; NONEON-NOSVE-NEXT: mov h17, v1.h[4] -; NONEON-NOSVE-NEXT: fmax s7, s18, s7 -; NONEON-NOSVE-NEXT: mov h18, v3.h[4] -; NONEON-NOSVE-NEXT: mov v5.h[1], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt h16, s20 -; NONEON-NOSVE-NEXT: fmax s19, s21, s19 -; NONEON-NOSVE-NEXT: fcvt s20, h23 -; NONEON-NOSVE-NEXT: mov h21, v1.h[5] -; NONEON-NOSVE-NEXT: mov h23, v2.h[5] -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: mov v4.h[2], v6.h[0] -; NONEON-NOSVE-NEXT: fcvt s6, h17 -; NONEON-NOSVE-NEXT: fcvt s17, h22 -; NONEON-NOSVE-NEXT: fcvt h7, s7 -; NONEON-NOSVE-NEXT: fcvt s18, h18 -; NONEON-NOSVE-NEXT: mov h22, v3.h[5] -; NONEON-NOSVE-NEXT: mov v5.h[2], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt h16, s19 -; NONEON-NOSVE-NEXT: mov h19, v0.h[6] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmax s6, s17, s6 -; NONEON-NOSVE-NEXT: mov h17, v1.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: fmax s18, s20, s18 -; NONEON-NOSVE-NEXT: mov h20, v3.h[6] -; NONEON-NOSVE-NEXT: mov v4.h[3], v7.h[0] -; 
NONEON-NOSVE-NEXT: fcvt s7, h22 -; NONEON-NOSVE-NEXT: fcvt s22, h23 -; NONEON-NOSVE-NEXT: mov v5.h[3], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt s16, h21 -; NONEON-NOSVE-NEXT: fcvt s21, h24 -; NONEON-NOSVE-NEXT: fcvt s19, h19 -; NONEON-NOSVE-NEXT: fcvt h6, s6 -; NONEON-NOSVE-NEXT: fcvt s17, h17 -; NONEON-NOSVE-NEXT: fcvt s23, h25 -; NONEON-NOSVE-NEXT: fcvt h18, s18 -; NONEON-NOSVE-NEXT: fcvt s20, h20 -; NONEON-NOSVE-NEXT: mov h3, v3.h[7] -; NONEON-NOSVE-NEXT: fmax s7, s22, s7 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmax s16, s21, s16 -; NONEON-NOSVE-NEXT: mov v4.h[4], v6.h[0] -; NONEON-NOSVE-NEXT: fmax s6, s19, s17 -; NONEON-NOSVE-NEXT: mov v5.h[4], v18.h[0] -; NONEON-NOSVE-NEXT: fmax s17, s23, s20 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt h7, s7 -; NONEON-NOSVE-NEXT: fmax s0, s0, s1 -; NONEON-NOSVE-NEXT: fcvt h16, s16 -; NONEON-NOSVE-NEXT: fcvt h6, s6 -; NONEON-NOSVE-NEXT: fmax s2, s2, s3 -; NONEON-NOSVE-NEXT: fcvt h3, s17 -; NONEON-NOSVE-NEXT: mov v5.h[5], v7.h[0] -; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: mov v4.h[5], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, s2 -; NONEON-NOSVE-NEXT: mov v5.h[6], v3.h[0] -; NONEON-NOSVE-NEXT: mov v4.h[6], v6.h[0] -; NONEON-NOSVE-NEXT: mov v5.h[7], v1.h[0] -; NONEON-NOSVE-NEXT: mov v4.h[7], v0.h[0] -; NONEON-NOSVE-NEXT: stp q5, q4, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #94] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; 
NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #92] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #90] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #86] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #84] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #82] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #80] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; 
NONEON-NOSVE-NEXT: str h0, [sp, #78] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #76] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #74] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #70] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #68] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #66] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -1014,7 
+1249,17 @@ define <2 x float> @fmax_v2f32(<2 x float> %op1, <2 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fmax_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmax v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fmax s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.maximum.v2f32(<2 x float> %op1, <2 x float> %op2) ret <2 x float> %res @@ -1032,7 +1277,22 @@ define <4 x float> @fmax_v4f32(<4 x float> %op1, <4 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fmax_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmax v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fmax s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fmax s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.maximum.v4f32(<4 x float> %op1, <4 x float> %op2) ret <4 x float> %res @@ -1052,11 +1312,39 @@ define void @fmax_v8f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fmax_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fmax v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fmax v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #60] +; NONEON-NOSVE-NEXT: fmax s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #56] +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #32] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #88] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #52] +; NONEON-NOSVE-NEXT: fmax s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #48] +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fmax s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, 
[sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #72] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fmax s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -1073,7 +1361,12 @@ define <1 x double> @fmax_v1f64(<1 x double> %op1, <1 x double> %op2) { ; ; NONEON-NOSVE-LABEL: fmax_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fmax d0, d0, d1 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.maximum.v1f64(<1 x double> %op1, <1 x double> %op2) ret <1 x double> %res @@ -1091,7 +1384,16 @@ define <2 x double> @fmax_v2f64(<2 x double> %op1, <2 x double> %op2) { ; ; NONEON-NOSVE-LABEL: fmax_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmax v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fmax d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fmax d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.maximum.v2f64(<2 x double> %op1, <2 x double> %op2) ret <2 x double> %res @@ -1111,11 +1413,27 @@ define void @fmax_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fmax_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fmax v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: fmax v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] +; NONEON-NOSVE-NEXT: fmax d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #48] +; NONEON-NOSVE-NEXT: fmax d0, d1, d0 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fmax d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fmax d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -1140,34 +1458,39 @@ define <4 x half> @fmin_v4f16(<4 x half> %op1, <4 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fmin_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 
-; NONEON-NOSVE-NEXT: mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] -; NONEON-NOSVE-NEXT: mov h4, v1.h[2] -; NONEON-NOSVE-NEXT: mov h5, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h1 -; NONEON-NOSVE-NEXT: fcvt s7, h0 -; NONEON-NOSVE-NEXT: mov h1, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmin s2, s3, s2 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fmin s5, s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: fmin s3, s4, s3 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt h0, s5 -; NONEON-NOSVE-NEXT: fcvt s4, h6 -; NONEON-NOSVE-NEXT: mov v0.h[1], v2.h[0] -; NONEON-NOSVE-NEXT: fcvt h2, s3 -; NONEON-NOSVE-NEXT: fmin s1, s4, s1 -; NONEON-NOSVE-NEXT: mov v0.h[2], v2.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; 
NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.minimum.v4f16(<4 x half> %op1, <4 x half> %op2) ret <4 x half> %res @@ -1185,60 +1508,66 @@ define <8 x half> @fmin_v8f16(<8 x half> %op1, <8 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fmin_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h1 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: mov h6, v1.h[2] -; NONEON-NOSVE-NEXT: mov h7, v0.h[2] -; NONEON-NOSVE-NEXT: mov h16, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fmin s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: fmin s3, s3, s2 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s4 -; NONEON-NOSVE-NEXT: fmin s4, s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[4] -; NONEON-NOSVE-NEXT: mov h7, v0.h[4] -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fmin s5, s5, s16 -; NONEON-NOSVE-NEXT: mov h16, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: mov v2.h[1], v3.h[0] -; NONEON-NOSVE-NEXT: fcvt s3, h6 -; NONEON-NOSVE-NEXT: fcvt s6, h7 -; NONEON-NOSVE-NEXT: mov h7, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt h5, s5 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: mov v2.h[2], v4.h[0] -; NONEON-NOSVE-NEXT: mov h4, v1.h[6] -; NONEON-NOSVE-NEXT: fmin s3, s6, s3 -; NONEON-NOSVE-NEXT: mov h6, v0.h[6] -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v2.h[3], v5.h[0] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fcvt s5, h6 
-; NONEON-NOSVE-NEXT: fmin s6, s16, s7 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v2.h[4], v3.h[0] -; NONEON-NOSVE-NEXT: fmin s4, s5, s4 -; NONEON-NOSVE-NEXT: fcvt h3, s6 -; NONEON-NOSVE-NEXT: fmin s0, s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[5], v3.h[0] -; NONEON-NOSVE-NEXT: fcvt h3, s4 -; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: mov v2.h[6], v3.h[0] -; NONEON-NOSVE-NEXT: mov v2.h[7], v0.h[0] -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] 
+; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.minimum.v8f16(<8 x half> %op1, <8 x half> %op2) ret <8 x half> %res @@ -1258,115 +1587,127 @@ define void @fmin_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fmin_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] -; NONEON-NOSVE-NEXT: mov h7, v0.h[1] -; NONEON-NOSVE-NEXT: mov h16, v0.h[2] -; NONEON-NOSVE-NEXT: mov h18, v2.h[1] -; NONEON-NOSVE-NEXT: mov h5, v1.h[1] -; NONEON-NOSVE-NEXT: mov h6, v1.h[2] -; NONEON-NOSVE-NEXT: mov h17, v3.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h1 -; NONEON-NOSVE-NEXT: fcvt s19, h0 -; NONEON-NOSVE-NEXT: fcvt s20, h3 -; NONEON-NOSVE-NEXT: fcvt s21, h2 -; NONEON-NOSVE-NEXT: mov h22, v3.h[2] -; NONEON-NOSVE-NEXT: mov h23, v2.h[2] -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: fcvt s18, h18 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s17, h17 -; NONEON-NOSVE-NEXT: fmin s4, s19, s4 -; NONEON-NOSVE-NEXT: mov h19, v0.h[3] -; NONEON-NOSVE-NEXT: mov h24, v3.h[3] -; NONEON-NOSVE-NEXT: fmin s20, s21, s20 -; NONEON-NOSVE-NEXT: fcvt s21, h22 -; NONEON-NOSVE-NEXT: fcvt s22, h23 -; NONEON-NOSVE-NEXT: mov h23, v2.h[3] -; 
NONEON-NOSVE-NEXT: mov h25, v2.h[6] -; NONEON-NOSVE-NEXT: fmin s5, s7, s5 -; NONEON-NOSVE-NEXT: mov h7, v1.h[3] -; NONEON-NOSVE-NEXT: fmin s6, s16, s6 -; NONEON-NOSVE-NEXT: fmin s16, s18, s17 -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: fcvt s18, h19 -; NONEON-NOSVE-NEXT: fcvt s19, h24 -; NONEON-NOSVE-NEXT: mov h24, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt h17, s5 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcvt h5, s20 -; NONEON-NOSVE-NEXT: fmin s20, s22, s21 -; NONEON-NOSVE-NEXT: fcvt h16, s16 -; NONEON-NOSVE-NEXT: fcvt s21, h23 -; NONEON-NOSVE-NEXT: fcvt h6, s6 -; NONEON-NOSVE-NEXT: mov h22, v0.h[4] -; NONEON-NOSVE-NEXT: mov h23, v2.h[4] -; NONEON-NOSVE-NEXT: mov v4.h[1], v17.h[0] -; NONEON-NOSVE-NEXT: mov h17, v1.h[4] -; NONEON-NOSVE-NEXT: fmin s7, s18, s7 -; NONEON-NOSVE-NEXT: mov h18, v3.h[4] -; NONEON-NOSVE-NEXT: mov v5.h[1], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt h16, s20 -; NONEON-NOSVE-NEXT: fmin s19, s21, s19 -; NONEON-NOSVE-NEXT: fcvt s20, h23 -; NONEON-NOSVE-NEXT: mov h21, v1.h[5] -; NONEON-NOSVE-NEXT: mov h23, v2.h[5] -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: mov v4.h[2], v6.h[0] -; NONEON-NOSVE-NEXT: fcvt s6, h17 -; NONEON-NOSVE-NEXT: fcvt s17, h22 -; NONEON-NOSVE-NEXT: fcvt h7, s7 -; NONEON-NOSVE-NEXT: fcvt s18, h18 -; NONEON-NOSVE-NEXT: mov h22, v3.h[5] -; NONEON-NOSVE-NEXT: mov v5.h[2], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt h16, s19 -; NONEON-NOSVE-NEXT: mov h19, v0.h[6] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmin s6, s17, s6 -; NONEON-NOSVE-NEXT: mov h17, v1.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: fmin s18, s20, s18 -; NONEON-NOSVE-NEXT: mov h20, v3.h[6] -; NONEON-NOSVE-NEXT: mov v4.h[3], v7.h[0] -; NONEON-NOSVE-NEXT: fcvt s7, h22 -; NONEON-NOSVE-NEXT: fcvt s22, h23 -; NONEON-NOSVE-NEXT: mov v5.h[3], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt s16, h21 -; NONEON-NOSVE-NEXT: fcvt s21, h24 -; NONEON-NOSVE-NEXT: fcvt s19, h19 -; NONEON-NOSVE-NEXT: 
fcvt h6, s6 -; NONEON-NOSVE-NEXT: fcvt s17, h17 -; NONEON-NOSVE-NEXT: fcvt s23, h25 -; NONEON-NOSVE-NEXT: fcvt h18, s18 -; NONEON-NOSVE-NEXT: fcvt s20, h20 -; NONEON-NOSVE-NEXT: mov h3, v3.h[7] -; NONEON-NOSVE-NEXT: fmin s7, s22, s7 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmin s16, s21, s16 -; NONEON-NOSVE-NEXT: mov v4.h[4], v6.h[0] -; NONEON-NOSVE-NEXT: fmin s6, s19, s17 -; NONEON-NOSVE-NEXT: mov v5.h[4], v18.h[0] -; NONEON-NOSVE-NEXT: fmin s17, s23, s20 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt h7, s7 -; NONEON-NOSVE-NEXT: fmin s0, s0, s1 -; NONEON-NOSVE-NEXT: fcvt h16, s16 -; NONEON-NOSVE-NEXT: fcvt h6, s6 -; NONEON-NOSVE-NEXT: fmin s2, s2, s3 -; NONEON-NOSVE-NEXT: fcvt h3, s17 -; NONEON-NOSVE-NEXT: mov v5.h[5], v7.h[0] -; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: mov v4.h[5], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, s2 -; NONEON-NOSVE-NEXT: mov v5.h[6], v3.h[0] -; NONEON-NOSVE-NEXT: mov v4.h[6], v6.h[0] -; NONEON-NOSVE-NEXT: mov v5.h[7], v1.h[0] -; NONEON-NOSVE-NEXT: mov v4.h[7], v0.h[0] -; NONEON-NOSVE-NEXT: stp q5, q4, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #94] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #92] +; NONEON-NOSVE-NEXT: ldr h0, [sp, 
#58] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #90] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #86] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #84] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #82] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #80] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #78] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt 
h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #76] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #74] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #70] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #68] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #66] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -1387,7 +1728,17 @@ define <2 x float> @fmin_v2f32(<2 x float> %op1, <2 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fmin_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmin v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: 
.cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fmin s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.minimum.v2f32(<2 x float> %op1, <2 x float> %op2) ret <2 x float> %res @@ -1405,7 +1756,22 @@ define <4 x float> @fmin_v4f32(<4 x float> %op1, <4 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fmin_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmin v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fmin s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fmin s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.minimum.v4f32(<4 x float> %op1, <4 x float> %op2) ret <4 x float> %res @@ -1425,11 +1791,39 @@ define void @fmin_v8f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fmin_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fmin v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fmin v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; 
NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #60] +; NONEON-NOSVE-NEXT: fmin s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #56] +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #32] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #88] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #52] +; NONEON-NOSVE-NEXT: fmin s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #48] +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fmin s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #72] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fmin s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -1446,7 +1840,12 @@ define <1 x double> @fmin_v1f64(<1 x double> %op1, <1 x double> %op2) { ; ; NONEON-NOSVE-LABEL: fmin_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fmin d0, d0, d1 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.minimum.v1f64(<1 x double> %op1, <1 x double> %op2) ret <1 x double> %res @@ -1464,7 +1863,16 @@ define <2 x double> @fmin_v2f64(<2 x double> %op1, <2 x double> %op2) { ; ; NONEON-NOSVE-LABEL: fmin_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmin v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fmin d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fmin d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.minimum.v2f64(<2 x double> %op1, <2 x double> %op2) ret <2 x double> %res @@ -1484,11 +1892,27 @@ define void @fmin_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fmin_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fmin v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: fmin v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] +; NONEON-NOSVE-NEXT: fmin d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #48] +; NONEON-NOSVE-NEXT: fmin d0, d1, d0 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fmin d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fmin d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll index f1561011e21812..f081d4ac65b279 100644 --- 
a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll @@ -30,26 +30,30 @@ define half @fadda_v4f16(half %start, <4 x half> %a) { ; ; NONEON-NOSVE-LABEL: fadda_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d1, [sp, #8] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[2] -; NONEON-NOSVE-NEXT: mov h1, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a) ret half %res diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll index a0a7dad835662e..4eaaee7ce5055d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll +++ 
b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll @@ -23,26 +23,30 @@ define half @fadda_v4f16(half %start, <4 x half> %a) { ; ; NONEON-NOSVE-LABEL: fadda_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d1, [sp, #8] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[2] -; NONEON-NOSVE-NEXT: mov h1, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a) ret half %res @@ -71,45 +75,49 @@ define half @fadda_v8f16(half %start, <8 x half> %a) { ; ; NONEON-NOSVE-LABEL: fadda_v8f16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q1, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h1, [sp] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[2] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[3] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[4] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[5] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: 
fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a) ret half %res @@ -154,86 +162,93 @@ define half @fadda_v16f16(half %start, ptr %a) { ; ; NONEON-NOSVE-LABEL: fadda_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: str q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[2] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[3] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[4] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt 
s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[5] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[2] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[3] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; 
NONEON-NOSVE-NEXT: mov h2, v1.h[4] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[5] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #28] ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op) @@ -251,10 +266,13 @@ define float @fadda_v2f32(float %start, <2 x float> %a) { ; ; NONEON-NOSVE-LABEL: fadda_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: mov s2, v1.s[1] -; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s2, s1, [sp, #8] ; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: 
fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a) ret float %res @@ -275,13 +293,15 @@ define float @fadda_v4f32(float %start, <4 x float> %a) { ; ; NONEON-NOSVE-LABEL: fadda_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov s2, v1.s[1] +; NONEON-NOSVE-NEXT: str q1, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp s2, s1, [sp] +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 -; NONEON-NOSVE-NEXT: mov s3, v1.s[2] -; NONEON-NOSVE-NEXT: mov s1, v1.s[3] +; NONEON-NOSVE-NEXT: ldp s2, s1, [sp, #8] ; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: fadd s0, s0, s3 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a) ret float %res @@ -310,22 +330,25 @@ define float @fadda_v8f32(float %start, ptr %a) { ; ; NONEON-NOSVE-LABEL: fadda_v8f32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [sp, #16] ; NONEON-NOSVE-NEXT: ldr q1, [x0] -; NONEON-NOSVE-NEXT: mov s2, v1.s[1] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: ldp s2, s1, [sp] +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 -; NONEON-NOSVE-NEXT: mov s3, v1.s[2] -; NONEON-NOSVE-NEXT: mov s1, v1.s[3] +; NONEON-NOSVE-NEXT: ldp s2, s1, [sp, #8] ; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: fadd s0, s0, s3 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: mov s2, v1.s[1] -; NONEON-NOSVE-NEXT: mov s3, v1.s[2] +; NONEON-NOSVE-NEXT: ldp s2, s1, [sp, #16] +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 -; NONEON-NOSVE-NEXT: mov s1, v1.s[3] +; NONEON-NOSVE-NEXT: ldp s2, s1, 
[sp, #24] ; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: fadd s0, s0, s3 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op) @@ -357,9 +380,11 @@ define double @fadda_v2f64(double %start, <2 x double> %a) { ; ; NONEON-NOSVE-LABEL: fadda_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov d2, v1.d[1] -; NONEON-NOSVE-NEXT: fadd d0, d0, d1 +; NONEON-NOSVE-NEXT: str q1, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp d2, d1, [sp], #16 ; NONEON-NOSVE-NEXT: fadd d0, d0, d2 +; NONEON-NOSVE-NEXT: fadd d0, d0, d1 ; NONEON-NOSVE-NEXT: ret %res = call double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a) ret double %res @@ -380,13 +405,19 @@ define double @fadda_v4f64(double %start, ptr %a) { ; ; NONEON-NOSVE-LABEL: fadda_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q3, q1, [x0] -; NONEON-NOSVE-NEXT: mov d2, v3.d[1] -; NONEON-NOSVE-NEXT: fadd d0, d0, d3 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: ldp d2, d1, [sp] ; NONEON-NOSVE-NEXT: fadd d0, d0, d2 -; NONEON-NOSVE-NEXT: mov d2, v1.d[1] ; NONEON-NOSVE-NEXT: fadd d0, d0, d1 +; NONEON-NOSVE-NEXT: ldp d2, d1, [sp, #16] ; NONEON-NOSVE-NEXT: fadd d0, d0, d2 +; NONEON-NOSVE-NEXT: fadd d0, d0, d1 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op) @@ -408,26 +439,30 @@ define half @faddv_v4f16(half %start, <4 x half> %a) { ; ; NONEON-NOSVE-LABEL: faddv_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: 
mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: fcvt s3, h1 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d1, [sp, #8] ; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fadd s2, s3, s2 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h1, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fadd s2, s2, s3 -; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #14] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fadd s1, s2, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 ; NONEON-NOSVE-NEXT: fcvt h1, s1 ; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call fast half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a) ret half %res @@ -444,45 +479,49 @@ define half @faddv_v8f16(half %start, <8 x half> %a) { ; ; NONEON-NOSVE-LABEL: faddv_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: fcvt s3, h1 +; NONEON-NOSVE-NEXT: str q1, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fadd s2, s3, s2 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #4] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fadd s2, s2, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #6] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fadd s2, s2, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #8] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fadd s2, s2, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fadd s2, s2, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt h1, s1 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fadd s2, s2, s3 -; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 +; NONEON-NOSVE-NEXT: 
ldr h2, [sp, #14] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fadd s1, s2, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 ; NONEON-NOSVE-NEXT: fcvt h1, s1 ; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call fast half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a) ret half %res @@ -500,54 +539,90 @@ define half @faddv_v16f16(half %start, ptr %a) { ; ; NONEON-NOSVE-LABEL: faddv_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtl v4.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: fadd v3.4s, v4.4s, v3.4s -; NONEON-NOSVE-NEXT: fadd v1.4s, v1.4s, v2.4s -; NONEON-NOSVE-NEXT: fcvtn v2.4h, v3.4s -; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v1.4s -; NONEON-NOSVE-NEXT: mov h1, v2.h[1] -; NONEON-NOSVE-NEXT: fcvt s3, h2 +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h4, [sp] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fadd s1, s3, s1 -; NONEON-NOSVE-NEXT: mov h3, v2.h[2] -; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fadd s1, s1, s3 -; NONEON-NOSVE-NEXT: mov h3, v2.h[3] -; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fadd s1, s2, s1 +; NONEON-NOSVE-NEXT: fadd s2, s4, s3 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h4, [sp, #4] ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fadd s1, s1, s3 -; NONEON-NOSVE-NEXT: mov h3, v2.h[4] +; NONEON-NOSVE-NEXT: fcvt s4, h4 ; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fadd s3, s4, s3 +; NONEON-NOSVE-NEXT: ldr h4, [sp, #6] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fadd s1, s1, s3 -; NONEON-NOSVE-NEXT: mov h3, v2.h[5] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fadd s1, s2, s1 +; NONEON-NOSVE-NEXT: fcvt h2, s3 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s3, h3 ; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s3, s4, s3 +; NONEON-NOSVE-NEXT: ldr h4, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 +; NONEON-NOSVE-NEXT: fcvt h2, s3 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #24] ; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s3, s4, s3 +; NONEON-NOSVE-NEXT: ldr h4, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; 
NONEON-NOSVE-NEXT: fadd s1, s1, s3 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 +; NONEON-NOSVE-NEXT: fcvt h2, s3 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s3, h3 ; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s3, s4, s3 +; NONEON-NOSVE-NEXT: ldr h4, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 +; NONEON-NOSVE-NEXT: fcvt h2, s3 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #28] ; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt h1, s1 ; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fadd s1, s1, s3 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 +; NONEON-NOSVE-NEXT: fadd s2, s4, s3 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h4, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 +; NONEON-NOSVE-NEXT: fadd s2, s4, s3 ; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt h2, s2 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fadd s1, s1, s2 ; NONEON-NOSVE-NEXT: fcvt h1, s1 ; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call fast half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op) @@ -565,8 +640,13 @@ define float @faddv_v2f32(float %start, <2 x float> %a) { ; ; NONEON-NOSVE-LABEL: faddv_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: faddp s1, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str 
d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s2, s1, [sp, #8] +; NONEON-NOSVE-NEXT: fadd s1, s2, s1 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call fast float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a) ret float %res @@ -583,8 +663,13 @@ define float @faddv_v4f32(float %start, <4 x float> %a) { ; ; NONEON-NOSVE-LABEL: faddv_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: faddp v1.4s, v1.4s, v1.4s -; NONEON-NOSVE-NEXT: faddp s1, v1.2s +; NONEON-NOSVE-NEXT: str q1, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp s2, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s4, s3, [sp], #16 +; NONEON-NOSVE-NEXT: fadd s3, s4, s3 +; NONEON-NOSVE-NEXT: fadd s1, s2, s1 +; NONEON-NOSVE-NEXT: fadd s1, s3, s1 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 ; NONEON-NOSVE-NEXT: ret %res = call fast float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a) @@ -604,10 +689,21 @@ define float @faddv_v8f32(float %start, ptr %a) { ; NONEON-NOSVE-LABEL: faddv_v8f32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] -; NONEON-NOSVE-NEXT: fadd v1.4s, v2.4s, v1.4s -; NONEON-NOSVE-NEXT: faddp v1.4s, v1.4s, v1.4s -; NONEON-NOSVE-NEXT: faddp s1, v1.2s +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp s2, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s4, s3, [sp] +; NONEON-NOSVE-NEXT: ldp s5, s6, [sp, #24] +; NONEON-NOSVE-NEXT: ldp s7, s16, [sp, #8] +; NONEON-NOSVE-NEXT: fadd s1, s3, s1 +; NONEON-NOSVE-NEXT: fadd s2, s4, s2 +; NONEON-NOSVE-NEXT: fadd s3, s7, s5 +; NONEON-NOSVE-NEXT: fadd s4, s16, s6 +; NONEON-NOSVE-NEXT: fadd s1, s2, s1 +; NONEON-NOSVE-NEXT: fadd s2, s3, s4 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call fast float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op) @@ -639,7 +735,10 @@ define double @faddv_v2f64(double %start, <2 x double> %a) { ; ; NONEON-NOSVE-LABEL: faddv_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: faddp d1, v1.2d +; NONEON-NOSVE-NEXT: str q1, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp d2, d1, [sp], #16 +; NONEON-NOSVE-NEXT: fadd d1, d2, d1 ; NONEON-NOSVE-NEXT: fadd d0, d0, d1 ; NONEON-NOSVE-NEXT: ret %res = call fast double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a) @@ -659,8 +758,13 @@ define double @faddv_v4f64(double %start, ptr %a) { ; NONEON-NOSVE-LABEL: faddv_v4f64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] -; NONEON-NOSVE-NEXT: fadd v1.2d, v2.2d, v1.2d -; NONEON-NOSVE-NEXT: faddp d1, v1.2d +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp d2, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d4, d3, [sp], #32 +; NONEON-NOSVE-NEXT: fadd d1, d3, d1 +; NONEON-NOSVE-NEXT: fadd d2, d4, d2 +; NONEON-NOSVE-NEXT: fadd d1, d2, d1 ; NONEON-NOSVE-NEXT: fadd d0, d0, d1 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a @@ -683,22 +787,26 @@ define half @fmaxv_v4f16(<4 x half> %a) { ; ; NONEON-NOSVE-LABEL: fmaxv_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h1, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmaxnm s1, s2, s1 -; NONEON-NOSVE-NEXT: mov h2, v0.h[2] -; NONEON-NOSVE-NEXT: mov h0, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmaxnm s1, s1, s2 -; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a) ret half %res @@ -715,41 +823,45 @@ define half @fmaxv_v8f16(<8 x half> %a) { ; ; NONEON-NOSVE-LABEL: fmaxv_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov h1, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmaxnm s1, s2, s1 -; NONEON-NOSVE-NEXT: mov h2, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmaxnm s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmaxnm s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[4] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmaxnm s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmaxnm s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[6] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmaxnm s1, s1, s2 -; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, 
[sp, #14] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %a) ret half %res @@ -767,81 +879,86 @@ define half @fmaxv_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: fmaxv_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h1 -; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h3, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fmaxnm s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[2] -; NONEON-NOSVE-NEXT: fmaxnm s2, s3, s2 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fmaxnm s1, s3, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h3, [sp, #4] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmaxnm s3, s5, s3 -; NONEON-NOSVE-NEXT: mov h5, v0.h[3] -; NONEON-NOSVE-NEXT: fmaxnm s2, s4, s2 -; NONEON-NOSVE-NEXT: mov h4, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt s4, h4 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmaxnm s4, 
s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[4] -; NONEON-NOSVE-NEXT: fmaxnm s2, s2, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fmaxnm s2, s3, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #22] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmaxnm s3, s5, s3 -; NONEON-NOSVE-NEXT: mov h5, v0.h[5] -; NONEON-NOSVE-NEXT: fmaxnm s2, s2, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s2, s3, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #24] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmaxnm s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[6] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: fmaxnm s2, s2, s3 -; NONEON-NOSVE-NEXT: fcvt h3, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s2, s3, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt h2, s2 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; 
NONEON-NOSVE-NEXT: ldr h2, [sp, #26] ; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s2, s3, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 ; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 -; NONEON-NOSVE-NEXT: fmaxnm s2, s2, s3 -; NONEON-NOSVE-NEXT: fmaxnm s3, s5, s4 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fmaxnm s1, s3, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h3, [sp, #14] ; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fmaxnm s2, s2, s3 -; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fmaxnm s1, s3, s2 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %op) @@ -859,7 +976,12 @@ define float @fmaxv_v2f32(<2 x float> %a) { ; ; NONEON-NOSVE-LABEL: fmaxv_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmaxnmp s0, v0.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #8] +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: 
add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a) ret float %res @@ -876,7 +998,14 @@ define float @fmaxv_v4f32(<4 x float> %a) { ; ; NONEON-NOSVE-LABEL: fmaxv_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmaxnmv s0, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp] +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s2, s1, [sp, #8] +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s2 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a) ret float %res @@ -895,8 +1024,20 @@ define float @fmaxv_v8f32(ptr %a) { ; NONEON-NOSVE-LABEL: fmaxv_v8f32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: fmaxnm v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fmaxnmv s0, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s3, s2, [sp] +; NONEON-NOSVE-NEXT: fmaxnm s0, s2, s0 +; NONEON-NOSVE-NEXT: fmaxnm s1, s3, s1 +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp, #8] +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s3, s1, [sp, #24] +; NONEON-NOSVE-NEXT: fmaxnm s2, s2, s3 +; NONEON-NOSVE-NEXT: fmaxnm s1, s4, s1 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s2 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %op) @@ -926,7 +1067,10 @@ define double @fmaxv_v2f64(<2 x double> %a) { ; ; NONEON-NOSVE-LABEL: fmaxv_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmaxnmp d0, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp], #16 +; NONEON-NOSVE-NEXT: fmaxnm d0, d1, d0 ; NONEON-NOSVE-NEXT: ret %res = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a) ret double %res @@ -945,8 +1089,13 @@ define double @fmaxv_v4f64(ptr %a) { ; NONEON-NOSVE-LABEL: fmaxv_v4f64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: fmaxnm v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: fmaxnmp d0, v0.2d +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d3, d2, [sp], #32 +; NONEON-NOSVE-NEXT: fmaxnm d0, d2, d0 +; NONEON-NOSVE-NEXT: fmaxnm d1, d3, d1 +; NONEON-NOSVE-NEXT: fmaxnm d0, d1, d0 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %op) @@ -968,22 +1117,26 @@ define half @fminv_v4f16(<4 x half> %a) { ; ; NONEON-NOSVE-LABEL: fminv_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h1, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fminnm s1, s2, s1 -; NONEON-NOSVE-NEXT: mov h2, v0.h[2] -; NONEON-NOSVE-NEXT: mov h0, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fminnm s1, s1, s2 -; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; 
NONEON-NOSVE-NEXT: fminnm s0, s1, s0 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a) ret half %res @@ -1000,41 +1153,45 @@ define half @fminv_v8f16(<8 x half> %a) { ; ; NONEON-NOSVE-LABEL: fminv_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov h1, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fminnm s1, s2, s1 -; NONEON-NOSVE-NEXT: mov h2, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fminnm s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fminnm s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[4] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fminnm s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; 
NONEON-NOSVE-NEXT: fminnm s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[6] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fminnm s1, s1, s2 -; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %a) ret half %res @@ -1052,81 +1209,86 @@ define half @fminv_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: fminv_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h1 -; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h3, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fminnm s1, s3, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h3, [sp, #4] ; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fminnm s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h1, s1 ; NONEON-NOSVE-NEXT: fminnm s2, s3, s2 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #22] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fminnm s3, s5, s3 -; NONEON-NOSVE-NEXT: mov h5, v0.h[3] -; NONEON-NOSVE-NEXT: fminnm s2, s4, s2 -; NONEON-NOSVE-NEXT: mov h4, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s2, s3, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #24] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fminnm s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[4] -; NONEON-NOSVE-NEXT: fminnm s2, s2, s3 -; NONEON-NOSVE-NEXT: 
mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s2, s3, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #26] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fminnm s3, s5, s3 -; NONEON-NOSVE-NEXT: mov h5, v0.h[5] -; NONEON-NOSVE-NEXT: fminnm s2, s2, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s2, s3, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #28] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fminnm s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[6] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: fminnm s2, s2, s3 -; NONEON-NOSVE-NEXT: fcvt h3, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fminnm s1, s3, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h3, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 
-; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 -; NONEON-NOSVE-NEXT: fminnm s2, s2, s3 -; NONEON-NOSVE-NEXT: fminnm s3, s5, s4 +; NONEON-NOSVE-NEXT: fminnm s1, s3, s2 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt h1, s1 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fminnm s2, s2, s3 -; NONEON-NOSVE-NEXT: fcvt h1, s2 ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %op) @@ -1144,7 +1306,12 @@ define float @fminv_v2f32(<2 x float> %a) { ; ; NONEON-NOSVE-LABEL: fminv_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fminnmp s0, v0.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #8] +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a) ret float %res @@ -1161,7 +1328,14 @@ define float @fminv_v4f32(<4 x float> %a) { ; ; NONEON-NOSVE-LABEL: fminv_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fminnmv s0, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp] +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s2, s1, [sp, #8] +; NONEON-NOSVE-NEXT: fminnm s0, s0, s2 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a) ret float %res @@ -1180,8 +1354,20 @@ define float @fminv_v8f32(ptr %a) { ; NONEON-NOSVE-LABEL: fminv_v8f32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: fminnm v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fminnmv s0, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s3, s2, [sp] +; NONEON-NOSVE-NEXT: fminnm s0, s2, s0 +; NONEON-NOSVE-NEXT: fminnm s1, s3, s1 +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp, #8] +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s3, s1, [sp, #24] +; NONEON-NOSVE-NEXT: fminnm s2, s2, s3 +; NONEON-NOSVE-NEXT: fminnm s1, s4, s1 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s2 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %op) @@ -1211,7 +1397,10 @@ define double @fminv_v2f64(<2 x double> %a) { ; ; NONEON-NOSVE-LABEL: fminv_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fminnmp d0, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp], #16 +; NONEON-NOSVE-NEXT: fminnm d0, d1, d0 ; NONEON-NOSVE-NEXT: ret %res = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a) ret double %res @@ -1230,8 +1419,13 @@ define double @fminv_v4f64(ptr %a) { ; NONEON-NOSVE-LABEL: fminv_v4f64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: fminnm v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: fminnmp d0, v0.2d +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d3, d2, [sp], #32 +; NONEON-NOSVE-NEXT: fminnm d0, d2, d0 +; NONEON-NOSVE-NEXT: fminnm d1, d3, d1 +; NONEON-NOSVE-NEXT: fminnm d0, d1, d0 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %op) @@ -1253,22 +1447,26 @@ define half @fmaximumv_v4f16(<4 x half> %a) { ; ; NONEON-NOSVE-LABEL: fmaximumv_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h1, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmax s1, s2, s1 -; NONEON-NOSVE-NEXT: mov h2, v0.h[2] -; NONEON-NOSVE-NEXT: mov h0, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmax s1, s1, s2 -; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; 
NONEON-NOSVE-NEXT: fmax s0, s1, s0 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fmaximum.v4f16(<4 x half> %a) ret half %res @@ -1285,41 +1483,45 @@ define half @fmaximumv_v8f16(<8 x half> %a) { ; ; NONEON-NOSVE-LABEL: fmaximumv_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov h1, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmax s1, s2, s1 -; NONEON-NOSVE-NEXT: mov h2, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmax s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmax s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[4] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmax s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: 
fmax s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[6] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmax s1, s1, s2 -; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmax s0, s1, s0 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fmaximum.v8f16(<8 x half> %a) ret half %res @@ -1337,81 +1539,86 @@ define half @fmaximumv_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: fmaximumv_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h1 -; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h3, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: fmax s1, s3, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h3, [sp, #4] ; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fmax s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h1, s1 ; NONEON-NOSVE-NEXT: fmax s2, s3, s2 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #22] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmax s3, s5, s3 -; NONEON-NOSVE-NEXT: mov h5, v0.h[3] -; NONEON-NOSVE-NEXT: fmax s2, s4, s2 -; NONEON-NOSVE-NEXT: mov h4, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s2, s3, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #24] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmax s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[4] -; NONEON-NOSVE-NEXT: fmax s2, s2, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; 
NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s2, s3, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #26] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmax s3, s5, s3 -; NONEON-NOSVE-NEXT: mov h5, v0.h[5] -; NONEON-NOSVE-NEXT: fmax s2, s2, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s2, s3, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #28] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmax s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[6] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: fmax s2, s2, s3 -; NONEON-NOSVE-NEXT: fcvt h3, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: fmax s1, s3, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h3, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; 
NONEON-NOSVE-NEXT: fmax s0, s0, s1 -; NONEON-NOSVE-NEXT: fmax s2, s2, s3 -; NONEON-NOSVE-NEXT: fmax s3, s5, s4 +; NONEON-NOSVE-NEXT: fmax s1, s3, s2 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt h1, s1 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fmax s2, s2, s3 -; NONEON-NOSVE-NEXT: fcvt h1, s2 ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call half @llvm.vector.reduce.fmaximum.v16f16(<16 x half> %op) @@ -1429,7 +1636,12 @@ define float @fmaximumv_v2f32(<2 x float> %a) { ; ; NONEON-NOSVE-LABEL: fmaximumv_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmaxp s0, v0.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #8] +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> %a) ret float %res @@ -1446,7 +1658,14 @@ define float @fmaximumv_v4f32(<4 x float> %a) { ; ; NONEON-NOSVE-LABEL: fmaximumv_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmaxv s0, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp] +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s2, s1, [sp, #8] +; NONEON-NOSVE-NEXT: fmax s0, s0, s2 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %a) ret float %res @@ -1465,8 +1684,20 @@ define float @fmaximumv_v8f32(ptr %a) { ; NONEON-NOSVE-LABEL: fmaximumv_v8f32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: fmax v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fmaxv s0, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s3, s2, [sp] +; NONEON-NOSVE-NEXT: fmax s0, s2, s0 +; NONEON-NOSVE-NEXT: fmax s1, s3, s1 +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp, #8] +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s3, s1, [sp, #24] +; NONEON-NOSVE-NEXT: fmax s2, s2, s3 +; NONEON-NOSVE-NEXT: fmax s1, s4, s1 +; NONEON-NOSVE-NEXT: fmax s0, s0, s2 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> %op) @@ -1496,7 +1727,10 @@ define double @fmaximumv_v2f64(<2 x double> %a) { ; ; NONEON-NOSVE-LABEL: fmaximumv_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmaxp d0, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp], #16 +; NONEON-NOSVE-NEXT: fmax d0, d1, d0 ; NONEON-NOSVE-NEXT: ret %res = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> %a) ret double %res @@ -1515,8 +1749,13 @@ define double @fmaximumv_v4f64(ptr %a) { ; NONEON-NOSVE-LABEL: fmaximumv_v4f64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: fmax v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: fmaxp d0, v0.2d +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d3, d2, [sp], #32 +; NONEON-NOSVE-NEXT: fmax d0, d2, d0 +; NONEON-NOSVE-NEXT: fmax d1, d3, d1 +; NONEON-NOSVE-NEXT: fmax d0, d1, d0 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> %op) @@ -1538,22 +1777,26 @@ define half @fminimumv_v4f16(<4 x half> %a) { ; ; NONEON-NOSVE-LABEL: fminimumv_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h1, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmin s1, s2, s1 -; NONEON-NOSVE-NEXT: mov h2, v0.h[2] -; NONEON-NOSVE-NEXT: mov h0, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmin s1, s1, s2 -; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; 
NONEON-NOSVE-NEXT: fmin s0, s1, s0 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fminimum.v4f16(<4 x half> %a) ret half %res @@ -1570,41 +1813,45 @@ define half @fminimumv_v8f16(<8 x half> %a) { ; ; NONEON-NOSVE-LABEL: fminimumv_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov h1, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmin s1, s2, s1 -; NONEON-NOSVE-NEXT: mov h2, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmin s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmin s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[4] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmin s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: 
fmin s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[6] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmin s1, s1, s2 -; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmin s0, s1, s0 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fminimum.v8f16(<8 x half> %a) ret half %res @@ -1622,81 +1869,86 @@ define half @fminimumv_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: fminimumv_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h1 -; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h3, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: fmin s1, s3, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h3, [sp, #4] ; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fmin s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h1, s1 ; NONEON-NOSVE-NEXT: fmin s2, s3, s2 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #22] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmin s3, s5, s3 -; NONEON-NOSVE-NEXT: mov h5, v0.h[3] -; NONEON-NOSVE-NEXT: fmin s2, s4, s2 -; NONEON-NOSVE-NEXT: mov h4, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s2, s3, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #24] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmin s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[4] -; NONEON-NOSVE-NEXT: fmin s2, s2, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; 
NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s2, s3, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #26] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmin s3, s5, s3 -; NONEON-NOSVE-NEXT: mov h5, v0.h[5] -; NONEON-NOSVE-NEXT: fmin s2, s2, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s2, s3, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #28] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmin s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[6] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: fmin s2, s2, s3 -; NONEON-NOSVE-NEXT: fcvt h3, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: fmin s1, s3, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h3, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; 
NONEON-NOSVE-NEXT: fmin s0, s0, s1 -; NONEON-NOSVE-NEXT: fmin s2, s2, s3 -; NONEON-NOSVE-NEXT: fmin s3, s5, s4 +; NONEON-NOSVE-NEXT: fmin s1, s3, s2 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt h1, s1 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fmin s2, s2, s3 -; NONEON-NOSVE-NEXT: fcvt h1, s2 ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call half @llvm.vector.reduce.fminimum.v16f16(<16 x half> %op) @@ -1714,7 +1966,12 @@ define float @fminimumv_v2f32(<2 x float> %a) { ; ; NONEON-NOSVE-LABEL: fminimumv_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fminp s0, v0.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #8] +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fminimum.v2f32(<2 x float> %a) ret float %res @@ -1731,7 +1988,14 @@ define float @fminimumv_v4f32(<4 x float> %a) { ; ; NONEON-NOSVE-LABEL: fminimumv_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fminv s0, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp] +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s2, s1, [sp, #8] +; NONEON-NOSVE-NEXT: fmin s0, s0, s2 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %a) ret float %res @@ -1750,8 +2014,20 @@ define float @fminimumv_v8f32(ptr %a) { ; NONEON-NOSVE-LABEL: fminimumv_v8f32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: fmin v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fminv s0, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s3, s2, [sp] +; NONEON-NOSVE-NEXT: fmin s0, s2, s0 +; NONEON-NOSVE-NEXT: fmin s1, s3, s1 +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp, #8] +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s3, s1, [sp, #24] +; NONEON-NOSVE-NEXT: fmin s2, s2, s3 +; NONEON-NOSVE-NEXT: fmin s1, s4, s1 +; NONEON-NOSVE-NEXT: fmin s0, s0, s2 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> %op) @@ -1781,7 +2057,10 @@ define double @fminimumv_v2f64(<2 x double> %a) { ; ; NONEON-NOSVE-LABEL: fminimumv_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fminp d0, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp], #16 +; NONEON-NOSVE-NEXT: fmin d0, d1, d0 ; NONEON-NOSVE-NEXT: ret %res = call double @llvm.vector.reduce.fminimum.v2f64(<2 x double> %a) ret double %res @@ -1800,8 +2079,13 @@ define double @fminimumv_v4f64(ptr %a) { ; NONEON-NOSVE-LABEL: fminimumv_v4f64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: fmin v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: fminp d0, v0.2d +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d3, d2, [sp], #32 +; NONEON-NOSVE-NEXT: fmin d0, d2, d0 +; NONEON-NOSVE-NEXT: fmin d1, d3, d1 +; NONEON-NOSVE-NEXT: fmin d0, d1, d0 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> %op) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll index 6af2b885ace08f..344aac5b198384 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll @@ -20,9 +20,30 @@ define <2 x half> @frintp_v2f16(<2 x half> %op) { ; ; NONEON-NOSVE-LABEL: frintp_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: frintp v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.ceil.v2f16(<2 x half> %op) ret <2 x half> %res @@ -39,9 +60,30 @@ define <4 x half> @frintp_v4f16(<4 x half> %op) { ; ; NONEON-NOSVE-LABEL: frintp_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: frintp v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.ceil.v4f16(<4 x half> %op) ret <4 x half> %res @@ -58,12 +100,50 @@ define <8 x half> @frintp_v8f16(<8 x half> %op) { ; ; NONEON-NOSVE-LABEL: frintp_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v0.8h -; NONEON-NOSVE-NEXT: frintp v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v1.4s -; NONEON-NOSVE-NEXT: frintp v1.4s, v2.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.ceil.v8f16(<8 x half> %op) ret <8 x half> %res @@ -81,20 +161,92 @@ define void @frintp_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: frintp_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: 
fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: frintp v2.4s, v2.4s -; NONEON-NOSVE-NEXT: frintp v3.4s, v3.4s -; NONEON-NOSVE-NEXT: frintp v0.4s, v0.4s -; NONEON-NOSVE-NEXT: frintp v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s -; NONEON-NOSVE-NEXT: fcvtn v3.4h, v3.4s -; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v1.4s -; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; 
NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.ceil.v16f16(<16 x half> %op) @@ -113,7 +265,15 @@ define <2 x float> @frintp_v2f32(<2 x float> %op) { ; ; NONEON-NOSVE-LABEL: frintp_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frintp v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frintp s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.ceil.v2f32(<2 x float> %op) ret <2 x float> %res @@ -130,7 +290,20 @@ define <4 x float> @frintp_v4f32(<4 x float> %op) { ; ; NONEON-NOSVE-LABEL: frintp_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frintp v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: frintp s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frintp s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.ceil.v4f32(<4 x float> %op) ret <4 x float> %res @@ -148,10 +321,32 @@ define void @frintp_v8f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: frintp_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: frintp v0.4s, v0.4s -; NONEON-NOSVE-NEXT: frintp v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: frintp s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: frintp s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: frintp s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frintp s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.ceil.v8f32(<8 x float> %op) @@ -167,7 +362,12 @@ define <1 x double> @frintp_v1f64(<1 x double> %op) { ; ; NONEON-NOSVE-LABEL: frintp_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: frintp d0, d0 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.ceil.v1f64(<1 x double> %op) ret <1 x double> %res @@ -184,7 +384,15 @@ define <2 x double> @frintp_v2f64(<2 x double> %op) { ; ; NONEON-NOSVE-LABEL: frintp_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frintp v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: frintp d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: frintp d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.ceil.v2f64(<2 x double> %op) ret <2 x double> %res @@ -202,10 +410,22 @@ define void @frintp_v4f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: frintp_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: frintp v0.2d, v0.2d -; NONEON-NOSVE-NEXT: frintp v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: frintp d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: frintp d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: frintp d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: frintp d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.ceil.v4f64(<4 x double> %op) @@ -228,9 +448,30 @@ define <2 x half> @frintm_v2f16(<2 x half> %op) { ; ; NONEON-NOSVE-LABEL: frintm_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: frintm v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.floor.v2f16(<2 x half> %op) ret <2 x half> %res @@ -247,9 +488,30 @@ define <4 x half> @frintm_v4f16(<4 x half> %op) { ; ; NONEON-NOSVE-LABEL: frintm_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: frintm v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.floor.v4f16(<4 x half> %op) ret <4 x half> %res @@ -266,12 +528,50 @@ define <8 x half> @frintm_v8f16(<8 x half> %op) { ; ; NONEON-NOSVE-LABEL: frintm_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v0.8h -; NONEON-NOSVE-NEXT: frintm v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v1.4s -; NONEON-NOSVE-NEXT: frintm v1.4s, v2.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.floor.v8f16(<8 x half> %op) ret <8 x half> %res @@ -289,20 +589,92 @@ define void @frintm_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: frintm_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: 
fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: frintm v2.4s, v2.4s -; NONEON-NOSVE-NEXT: frintm v3.4s, v3.4s -; NONEON-NOSVE-NEXT: frintm v0.4s, v0.4s -; NONEON-NOSVE-NEXT: frintm v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s -; NONEON-NOSVE-NEXT: fcvtn v3.4h, v3.4s -; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v1.4s -; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; 
NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.floor.v16f16(<16 x half> %op) @@ -321,7 +693,15 @@ define <2 x float> @frintm_v2f32(<2 x float> %op) { ; ; NONEON-NOSVE-LABEL: frintm_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frintm v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frintm s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.floor.v2f32(<2 x float> %op) ret <2 x float> %res @@ -338,7 +718,20 @@ define <4 x float> @frintm_v4f32(<4 x float> %op) { ; ; NONEON-NOSVE-LABEL: frintm_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frintm v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: frintm s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frintm s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.floor.v4f32(<4 x float> %op) ret <4 x float> %res @@ -356,10 +749,32 @@ define void @frintm_v8f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: frintm_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: frintm v0.4s, v0.4s -; NONEON-NOSVE-NEXT: frintm v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: frintm s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: frintm s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: frintm s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frintm s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.floor.v8f32(<8 x float> %op) @@ -375,7 +790,12 @@ define <1 x double> @frintm_v1f64(<1 x double> %op) { ; ; NONEON-NOSVE-LABEL: frintm_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: frintm d0, d0 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.floor.v1f64(<1 x double> %op) ret <1 x double> %res @@ -392,7 +812,15 @@ define <2 x double> @frintm_v2f64(<2 x double> %op) { ; ; NONEON-NOSVE-LABEL: frintm_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frintm v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: frintm d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: frintm d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.floor.v2f64(<2 x double> %op) ret <2 x double> %res @@ -410,10 +838,22 @@ define void @frintm_v4f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: frintm_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: frintm v0.2d, v0.2d -; NONEON-NOSVE-NEXT: frintm v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: frintm d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: frintm d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: frintm d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: frintm d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.floor.v4f64(<4 x double> %op) @@ -436,9 +876,30 @@ define <2 x half> @frinti_v2f16(<2 x half> %op) { ; ; NONEON-NOSVE-LABEL: frinti_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: frinti v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.nearbyint.v2f16(<2 x half> %op) ret <2 x half> %res @@ -455,9 +916,30 @@ define <4 x half> @frinti_v4f16(<4 x half> %op) { ; ; NONEON-NOSVE-LABEL: frinti_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: frinti v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.nearbyint.v4f16(<4 x half> %op) ret <4 x half> %res @@ -474,12 +956,50 @@ define <8 x half> @frinti_v8f16(<8 x half> %op) { ; ; NONEON-NOSVE-LABEL: frinti_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v0.8h -; NONEON-NOSVE-NEXT: frinti v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v1.4s -; NONEON-NOSVE-NEXT: frinti v1.4s, v2.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.nearbyint.v8f16(<8 x half> %op) ret <8 x half> %res @@ -497,20 +1017,92 @@ define void @frinti_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: frinti_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; 
NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: frinti v2.4s, v2.4s -; NONEON-NOSVE-NEXT: frinti v3.4s, v3.4s -; NONEON-NOSVE-NEXT: frinti v0.4s, v0.4s -; NONEON-NOSVE-NEXT: frinti v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s -; NONEON-NOSVE-NEXT: fcvtn v3.4h, v3.4s -; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v1.4s -; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str 
h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.nearbyint.v16f16(<16 x half> %op) @@ -529,7 +1121,15 @@ define <2 x float> @frinti_v2f32(<2 x float> %op) { ; ; NONEON-NOSVE-LABEL: frinti_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frinti v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frinti s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %op) ret <2 x float> %res @@ -546,7 +1146,20 @@ define <4 x float> @frinti_v4f32(<4 x float> %op) { ; ; NONEON-NOSVE-LABEL: frinti_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frinti v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: frinti s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frinti s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %op) ret <4 x float> %res @@ -564,10 +1177,32 @@ define void @frinti_v8f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: frinti_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: frinti v0.4s, v0.4s -; NONEON-NOSVE-NEXT: frinti v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: frinti s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: frinti s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: frinti s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frinti s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %op) @@ -583,7 +1218,12 @@ define <1 x double> @frinti_v1f64(<1 x double> %op) { ; ; NONEON-NOSVE-LABEL: frinti_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: frinti d0, d0 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.nearbyint.v1f64(<1 x double> %op) ret <1 x double> %res @@ -600,7 +1240,15 @@ define <2 x double> @frinti_v2f64(<2 x double> %op) { ; ; NONEON-NOSVE-LABEL: frinti_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frinti v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: frinti d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: frinti d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %op) ret <2 x double> %res @@ -618,10 +1266,22 @@ define void @frinti_v4f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: frinti_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: frinti v0.2d, v0.2d -; NONEON-NOSVE-NEXT: frinti v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: frinti d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: frinti d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: frinti d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: frinti d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %op) @@ -644,9 +1304,30 @@ define <2 x half> @frintx_v2f16(<2 x half> %op) { ; ; NONEON-NOSVE-LABEL: frintx_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: frintx v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.rint.v2f16(<2 x half> %op) ret <2 x half> %res @@ -663,9 +1344,30 @@ define <4 x half> @frintx_v4f16(<4 x half> %op) { ; ; NONEON-NOSVE-LABEL: frintx_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: frintx v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.rint.v4f16(<4 x half> %op) ret <4 x half> %res @@ -682,12 +1384,50 @@ define <8 x half> @frintx_v8f16(<8 x half> %op) { ; ; NONEON-NOSVE-LABEL: frintx_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v0.8h -; NONEON-NOSVE-NEXT: frintx v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v1.4s -; NONEON-NOSVE-NEXT: frintx v1.4s, v2.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.rint.v8f16(<8 x half> %op) ret <8 x half> %res @@ -705,20 +1445,92 @@ define void @frintx_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: frintx_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: 
fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: frintx v2.4s, v2.4s -; NONEON-NOSVE-NEXT: frintx v3.4s, v3.4s -; NONEON-NOSVE-NEXT: frintx v0.4s, v0.4s -; NONEON-NOSVE-NEXT: frintx v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s -; NONEON-NOSVE-NEXT: fcvtn v3.4h, v3.4s -; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v1.4s -; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; 
NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.rint.v16f16(<16 x half> %op) @@ -737,7 +1549,15 @@ define <2 x float> @frintx_v2f32(<2 x float> %op) { ; ; NONEON-NOSVE-LABEL: frintx_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frintx v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frintx s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.rint.v2f32(<2 x float> %op) ret <2 x float> %res @@ -754,7 +1574,20 @@ define <4 x float> @frintx_v4f32(<4 x float> %op) { ; ; NONEON-NOSVE-LABEL: frintx_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frintx v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: frintx s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frintx s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.rint.v4f32(<4 x float> %op) ret <4 x float> %res @@ -772,10 +1605,32 @@ define void @frintx_v8f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: frintx_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: frintx v0.4s, v0.4s -; NONEON-NOSVE-NEXT: frintx v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: frintx s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: frintx s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: frintx s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frintx s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.rint.v8f32(<8 x float> %op) @@ -791,7 +1646,12 @@ define <1 x double> @frintx_v1f64(<1 x double> %op) { ; ; NONEON-NOSVE-LABEL: frintx_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: frintx d0, d0 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.rint.v1f64(<1 x double> %op) ret <1 x double> %res @@ -808,7 +1668,15 @@ define <2 x double> @frintx_v2f64(<2 x double> %op) { ; ; NONEON-NOSVE-LABEL: frintx_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frintx v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: frintx d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: frintx d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.rint.v2f64(<2 x double> %op) ret <2 x double> %res @@ -826,10 +1694,22 @@ define void @frintx_v4f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: frintx_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: frintx v0.2d, v0.2d -; NONEON-NOSVE-NEXT: frintx v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: frintx d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: frintx d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: frintx d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: frintx d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.rint.v4f64(<4 x double> %op) @@ -852,9 +1732,30 @@ define <2 x half> @frinta_v2f16(<2 x half> %op) { ; ; NONEON-NOSVE-LABEL: frinta_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: frinta v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.round.v2f16(<2 x half> %op) ret <2 x half> %res @@ -871,9 +1772,30 @@ define <4 x half> @frinta_v4f16(<4 x half> %op) { ; ; NONEON-NOSVE-LABEL: frinta_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: frinta v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.round.v4f16(<4 x half> %op) ret <4 x half> %res @@ -890,12 +1812,50 @@ define <8 x half> @frinta_v8f16(<8 x half> %op) { ; ; NONEON-NOSVE-LABEL: frinta_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v0.8h -; NONEON-NOSVE-NEXT: frinta v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v1.4s -; NONEON-NOSVE-NEXT: frinta v1.4s, v2.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.round.v8f16(<8 x half> %op) ret <8 x half> %res @@ -913,20 +1873,92 @@ define void @frinta_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: frinta_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: 
fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: frinta v2.4s, v2.4s -; NONEON-NOSVE-NEXT: frinta v3.4s, v3.4s -; NONEON-NOSVE-NEXT: frinta v0.4s, v0.4s -; NONEON-NOSVE-NEXT: frinta v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s -; NONEON-NOSVE-NEXT: fcvtn v3.4h, v3.4s -; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v1.4s -; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; 
NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.round.v16f16(<16 x half> %op) @@ -945,7 +1977,15 @@ define <2 x float> @frinta_v2f32(<2 x float> %op) { ; ; NONEON-NOSVE-LABEL: frinta_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frinta v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frinta s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.round.v2f32(<2 x float> %op) ret <2 x float> %res @@ -962,7 +2002,20 @@ define <4 x float> @frinta_v4f32(<4 x float> %op) { ; ; NONEON-NOSVE-LABEL: frinta_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frinta v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: frinta s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frinta s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.round.v4f32(<4 x float> %op) ret <4 x float> %res @@ -980,10 +2033,32 @@ define void @frinta_v8f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: frinta_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: frinta v0.4s, v0.4s -; NONEON-NOSVE-NEXT: frinta v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: frinta s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: frinta s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: frinta s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frinta s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.round.v8f32(<8 x float> %op) @@ -999,7 +2074,12 @@ define <1 x double> @frinta_v1f64(<1 x double> %op) { ; ; NONEON-NOSVE-LABEL: frinta_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: frinta d0, d0 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.round.v1f64(<1 x double> %op) ret <1 x double> %res @@ -1016,7 +2096,15 @@ define <2 x double> @frinta_v2f64(<2 x double> %op) { ; ; NONEON-NOSVE-LABEL: frinta_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frinta v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: frinta d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: frinta d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.round.v2f64(<2 x double> %op) ret <2 x double> %res @@ -1034,10 +2122,22 @@ define void @frinta_v4f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: frinta_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: frinta v0.2d, v0.2d -; NONEON-NOSVE-NEXT: frinta v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: frinta d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: frinta d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: frinta d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: frinta d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.round.v4f64(<4 x double> %op) @@ -1060,9 +2160,30 @@ define <2 x half> @frintn_v2f16(<2 x half> %op) { ; ; NONEON-NOSVE-LABEL: frintn_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: frintn v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %op) ret <2 x half> %res @@ -1079,9 +2200,30 @@ define <4 x half> @frintn_v4f16(<4 x half> %op) { ; ; NONEON-NOSVE-LABEL: frintn_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: frintn v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %op) ret <4 x half> %res @@ -1098,12 +2240,50 @@ define <8 x half> @frintn_v8f16(<8 x half> %op) { ; ; NONEON-NOSVE-LABEL: frintn_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v0.8h -; NONEON-NOSVE-NEXT: frintn v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v1.4s -; NONEON-NOSVE-NEXT: frintn v1.4s, v2.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.roundeven.v8f16(<8 x half> %op) ret <8 x half> %res @@ -1121,20 +2301,92 @@ define void @frintn_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: frintn_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; 
NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: frintn v2.4s, v2.4s -; NONEON-NOSVE-NEXT: frintn v3.4s, v3.4s -; NONEON-NOSVE-NEXT: frintn v0.4s, v0.4s -; NONEON-NOSVE-NEXT: frintn v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s -; NONEON-NOSVE-NEXT: fcvtn v3.4h, v3.4s -; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v1.4s -; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str 
h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.roundeven.v16f16(<16 x half> %op) @@ -1153,7 +2405,15 @@ define <2 x float> @frintn_v2f32(<2 x float> %op) { ; ; NONEON-NOSVE-LABEL: frintn_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frintn v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frintn s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.roundeven.v2f32(<2 x float> %op) ret <2 x float> %res @@ -1170,7 +2430,20 @@ define <4 x float> @frintn_v4f32(<4 x float> %op) { ; ; NONEON-NOSVE-LABEL: frintn_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frintn v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: frintn s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frintn s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %op) ret <4 x float> %res @@ -1188,10 +2461,32 @@ define void @frintn_v8f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: frintn_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: frintn v0.4s, v0.4s -; NONEON-NOSVE-NEXT: frintn v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: frintn s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: frintn s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: frintn s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frintn s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %op) @@ -1207,7 +2502,12 @@ define <1 x double> @frintn_v1f64(<1 x double> %op) { ; ; NONEON-NOSVE-LABEL: frintn_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: frintn d0, d0 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.roundeven.v1f64(<1 x double> %op) ret <1 x double> %res @@ -1224,7 +2524,15 @@ define <2 x double> @frintn_v2f64(<2 x double> %op) { ; ; NONEON-NOSVE-LABEL: frintn_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frintn v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: frintn d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: frintn d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %op) ret <2 x double> %res @@ -1242,10 +2550,22 @@ define void @frintn_v4f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: frintn_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: frintn v0.2d, v0.2d -; NONEON-NOSVE-NEXT: frintn v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: frintn d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: frintn d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: frintn d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: frintn d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %op) @@ -1268,9 +2588,30 @@ define <2 x half> @frintz_v2f16(<2 x half> %op) { ; ; NONEON-NOSVE-LABEL: frintz_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: frintz v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.trunc.v2f16(<2 x half> %op) ret <2 x half> %res @@ -1287,9 +2628,30 @@ define <4 x half> @frintz_v4f16(<4 x half> %op) { ; ; NONEON-NOSVE-LABEL: frintz_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: frintz v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.trunc.v4f16(<4 x half> %op) ret <4 x half> %res @@ -1306,12 +2668,50 @@ define <8 x half> @frintz_v8f16(<8 x half> %op) { ; ; NONEON-NOSVE-LABEL: frintz_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v0.8h -; NONEON-NOSVE-NEXT: frintz v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v1.4s -; NONEON-NOSVE-NEXT: frintz v1.4s, v2.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.trunc.v8f16(<8 x half> %op) ret <8 x half> %res @@ -1329,20 +2729,92 @@ define void @frintz_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: frintz_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: 
fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: frintz v2.4s, v2.4s -; NONEON-NOSVE-NEXT: frintz v3.4s, v3.4s -; NONEON-NOSVE-NEXT: frintz v0.4s, v0.4s -; NONEON-NOSVE-NEXT: frintz v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s -; NONEON-NOSVE-NEXT: fcvtn v3.4h, v3.4s -; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v1.4s -; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; 
NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.trunc.v16f16(<16 x half> %op) @@ -1361,7 +2833,15 @@ define <2 x float> @frintz_v2f32(<2 x float> %op) { ; ; NONEON-NOSVE-LABEL: frintz_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frintz v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frintz s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.trunc.v2f32(<2 x float> %op) ret <2 x float> %res @@ -1378,7 +2858,20 @@ define <4 x float> @frintz_v4f32(<4 x float> %op) { ; ; NONEON-NOSVE-LABEL: frintz_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frintz v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: frintz s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frintz s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.trunc.v4f32(<4 x float> %op) ret <4 x float> %res @@ -1396,10 +2889,32 @@ define void @frintz_v8f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: frintz_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: frintz v0.4s, v0.4s -; NONEON-NOSVE-NEXT: frintz v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: frintz s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: frintz s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: frintz s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frintz s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.trunc.v8f32(<8 x float> %op) @@ -1415,7 +2930,12 @@ define <1 x double> @frintz_v1f64(<1 x double> %op) { ; ; NONEON-NOSVE-LABEL: frintz_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: frintz d0, d0 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.trunc.v1f64(<1 x double> %op) ret <1 x double> %res @@ -1432,7 +2952,15 @@ define <2 x double> @frintz_v2f64(<2 x double> %op) { ; ; NONEON-NOSVE-LABEL: frintz_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frintz v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: frintz d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: frintz d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.trunc.v2f64(<2 x double> %op) ret <2 x double> %res @@ -1450,10 +2978,22 @@ define void @frintz_v4f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: frintz_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: frintz v0.2d, v0.2d -; NONEON-NOSVE-NEXT: frintz v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: frintz d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: frintz d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: frintz d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: frintz d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.trunc.v4f64(<4 x double> %op) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll index 824419b31a5a83..daa9b51cc827b3 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll @@ -20,10 +20,28 @@ define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v2f16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: 
.cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: dup v2.4h, w8 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <2 x half> %op1, <2 x half> %op2 ret <2 x half> %sel @@ -44,10 +62,28 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v4f16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: dup v2.4h, w8 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: 
str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <4 x half> %op1, <4 x half> %op2 ret <4 x half> %sel @@ -68,10 +104,43 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v8f16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: dup v2.8h, w8 -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; 
NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <8 x half> %op1, <8 x half> %op2 ret <8 x half> %sel @@ -95,16 +164,83 @@ define void @select_v16f16(ptr %a, ptr %b, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v16f16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] ; NONEON-NOSVE-NEXT: tst w2, #0x1 -; NONEON-NOSVE-NEXT: ldr q1, [x0] -; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: ldr q3, [x1] -; NONEON-NOSVE-NEXT: ldr q4, [x1, #16] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 -; NONEON-NOSVE-NEXT: bif v1.16b, v3.16b, v0.16b -; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v4.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x1] +; NONEON-NOSVE-NEXT: ldr q3, [x1, #16] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: stp q1, q3, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: str q2, [sp, #48] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: str h0, [sp, #78] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: str h0, [sp, #76] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: str h0, [sp, #74] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: str h0, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: str h0, [sp, #70] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; 
NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: str h0, [sp, #68] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: str h0, [sp, #66] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #62] +; NONEON-NOSVE-NEXT: str h0, [sp, #64] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #60] +; NONEON-NOSVE-NEXT: str h0, [sp, #94] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #58] +; NONEON-NOSVE-NEXT: str h0, [sp, #92] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #56] +; NONEON-NOSVE-NEXT: str h0, [sp, #90] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #54] +; NONEON-NOSVE-NEXT: str h0, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #52] +; NONEON-NOSVE-NEXT: str h0, [sp, #86] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #50] +; NONEON-NOSVE-NEXT: str h0, [sp, #84] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #48] +; NONEON-NOSVE-NEXT: str h0, [sp, #82] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: str h0, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load volatile <16 x half>, ptr %a %op2 = load volatile <16 x half>, ptr %b @@ -128,10 +264,18 @@ define <2 x float> @select_v2f32(<2 x float> 
%op1, <2 x float> %op2, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v2f32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: dup v2.2s, w8 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fcsel s3, s2, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <2 x float> %op1, <2 x float> %op2 ret <2 x float> %sel @@ -152,10 +296,23 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v4f32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: dup v2.4s, w8 -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fcsel s3, s2, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fcsel s3, s2, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <4 x float> %op1, <4 x float> %op2 ret <4 x float> %sel @@ -179,16 +336,43 @@ define void @select_v8f32(ptr %a, ptr %b, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v8f32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] ; NONEON-NOSVE-NEXT: tst w2, #0x1 -; NONEON-NOSVE-NEXT: ldr q1, [x0] -; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: ldr q3, [x1] -; NONEON-NOSVE-NEXT: ldr q4, [x1, #16] -; NONEON-NOSVE-NEXT: dup v0.4s, w8 -; NONEON-NOSVE-NEXT: bif v1.16b, v3.16b, v0.16b -; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v4.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x1] +; NONEON-NOSVE-NEXT: ldr q3, [x1, #16] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: stp q1, q3, [sp, #16] +; NONEON-NOSVE-NEXT: str q2, [sp, #48] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fcsel s3, s0, s2, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: 
ldp s1, s2, [sp, #32] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #72] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fcsel s3, s0, s2, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #56] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #64] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: fcsel s3, s0, s2, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #48] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #88] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: fcsel s3, s0, s2, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load volatile <8 x float>, ptr %a %op2 = load volatile <8 x float>, ptr %b @@ -206,10 +390,13 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, i1 %mask ; ; NONEON-NOSVE-LABEL: select_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm x8, ne -; NONEON-NOSVE-NEXT: fmov d2, x8 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: fcsel d0, d0, d1, ne +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <1 x double> %op1, <1 x double> %op2 ret <1 x double> %sel @@ -231,10 +418,17 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask ; ; NONEON-NOSVE-LABEL: select_v2f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm x8, ne -; NONEON-NOSVE-NEXT: dup v2.2d, x8 -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fcsel d3, d2, d0, ne +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fcsel d0, d1, d0, ne +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <2 x double> %op1, <2 x double> %op2 ret <2 x double> %sel @@ -259,16 +453,31 @@ define void @select_v4f64(ptr %a, ptr %b, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v4f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] ; NONEON-NOSVE-NEXT: tst w2, #0x1 -; NONEON-NOSVE-NEXT: ldr q1, [x0] -; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] -; NONEON-NOSVE-NEXT: csetm x8, ne -; NONEON-NOSVE-NEXT: ldr q3, [x1] -; NONEON-NOSVE-NEXT: ldr q4, [x1, #16] -; NONEON-NOSVE-NEXT: dup v0.2d, x8 -; NONEON-NOSVE-NEXT: bif v1.16b, v3.16b, v0.16b -; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v4.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x1] +; NONEON-NOSVE-NEXT: ldr q3, [x1, #16] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: stp q1, q3, [sp, #16] +; NONEON-NOSVE-NEXT: str q2, [sp, #48] +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fcsel d3, d0, d2, ne +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fcsel d0, d0, d1, ne +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp, #48] +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: fcsel d3, d0, d2, ne +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: fcsel d0, d0, d1, ne +; NONEON-NOSVE-NEXT: 
stp d0, d3, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load volatile <4 x double>, ptr %a %op2 = load volatile <4 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll index c853bdc5af8db0..0d92a6fa0fa28d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll @@ -19,9 +19,26 @@ define <4 x i16> @fcvtzu_v4f16_v4i16(<4 x half> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s -; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptoui <4 x half> %op1 to <4 x i16> ret <4 x i16> %res @@ -39,16 +56,43 @@ define void @fcvtzu_v8f16_v8i16(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtzu v1.4s, v1.4s -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #22] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #20] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #18] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: str q0, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fptoui <8 x half> %op1 to <8 x i16> @@ -69,22 +113,75 @@ define void @fcvtzu_v16f16_v16i16(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp 
q1, q0, [x0] -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h -; NONEON-NOSVE-NEXT: fcvtzu v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtzu v2.4s, v2.4s -; NONEON-NOSVE-NEXT: fcvtzu v3.4s, v3.4s -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v3.8h -; NONEON-NOSVE-NEXT: stp q1, q0, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; 
NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fptoui <16 x half> %op1 to <16 x i16> @@ -108,9 +205,17 @@ define <2 x i32> @fcvtzu_v2f16_v2i32(<2 x half> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v2f16_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x half> %op1 to <2 x i32> ret <2 x i32> %res @@ -128,8 +233,25 @@ define <4 x i32> @fcvtzu_v4f16_v4i32(<4 x half> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtzu w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptoui <4 x half> %op1 to <4 x i32> ret <4 x i32> %res @@ -151,15 +273,41 @@ define void @fcvtzu_v8f16_v8i32(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtzu v1.4s, v1.4s -; NONEON-NOSVE-NEXT: stp q0, q1, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: fcvtzu w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzu w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: fcvtzu w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fptoui <8 x half> %op1 to <8 x i32> @@ -189,21 +337,73 @@ define void @fcvtzu_v16f16_v16i32(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, 
#-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h -; NONEON-NOSVE-NEXT: fcvtzu v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtzu v2.4s, v2.4s -; NONEON-NOSVE-NEXT: fcvtzu v3.4s, v3.4s -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-128]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: fcvtzu w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] +; NONEON-NOSVE-NEXT: fcvtzu w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: fcvtzu w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: 
ldp q3, q2, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzu w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: fcvtzu w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: fcvtzu w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: fcvtzu w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fptoui <16 x half> %op1 to <16 x i32> @@ -224,9 +424,13 @@ define <1 x i64> @fcvtzu_v1f16_v1i64(<1 x half> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v1f16_v1i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvtzu x8, s0 -; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptoui <1 x half> %op1 to <1 x i64> ret <1 x i64> %res @@ -246,14 +450,18 @@ define <2 x i64> @fcvtzu_v2f16_v2i64(<2 x half> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v2f16_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 
killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h1, v0.h[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvtzu x8, s0 -; NONEON-NOSVE-NEXT: fcvtzu x9, s1 -; NONEON-NOSVE-NEXT: fmov d0, x8 -; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x half> %op1 to <2 x i64> ret <2 x i64> %res @@ -280,23 +488,27 @@ define void @fcvtzu_v4f16_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #48 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 ; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: mov h1, v0.h[2] -; NONEON-NOSVE-NEXT: mov h2, v0.h[3] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 ; NONEON-NOSVE-NEXT: fcvtzu x9, s0 -; NONEON-NOSVE-NEXT: fcvtzu x8, s1 -; NONEON-NOSVE-NEXT: fcvtzu x10, s2 -; NONEON-NOSVE-NEXT: fcvtzu x11, s3 -; NONEON-NOSVE-NEXT: fmov d1, x9 -; NONEON-NOSVE-NEXT: fmov d0, x8 -; NONEON-NOSVE-NEXT: mov v1.d[1], x11 -; NONEON-NOSVE-NEXT: mov v0.d[1], x10 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: fcvtzu x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 
+; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #16] ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x half>, ptr %a %res = fptoui <4 x half> %op1 to <4 x i64> @@ -339,42 +551,43 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] -; NONEON-NOSVE-NEXT: mov h1, v0.h[2] -; NONEON-NOSVE-NEXT: mov h3, v0.h[3] -; NONEON-NOSVE-NEXT: mov h4, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov h5, v2.h[2] -; NONEON-NOSVE-NEXT: mov h6, v2.h[3] -; NONEON-NOSVE-NEXT: mov h7, v2.h[1] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: str q0, [sp, #-96]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvtzu x9, s0 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvtzu x13, s2 -; NONEON-NOSVE-NEXT: fcvtzu x8, s1 -; NONEON-NOSVE-NEXT: fcvt s1, h7 -; NONEON-NOSVE-NEXT: fcvtzu x10, s3 -; NONEON-NOSVE-NEXT: fcvtzu x11, s4 -; NONEON-NOSVE-NEXT: fcvtzu x12, s5 -; NONEON-NOSVE-NEXT: fcvtzu x14, s6 -; NONEON-NOSVE-NEXT: fmov d3, x13 -; NONEON-NOSVE-NEXT: fmov d0, x8 -; NONEON-NOSVE-NEXT: fcvtzu x8, s1 -; NONEON-NOSVE-NEXT: fmov d1, x9 -; NONEON-NOSVE-NEXT: fmov d2, x12 -; NONEON-NOSVE-NEXT: mov v0.d[1], x10 -; NONEON-NOSVE-NEXT: mov v1.d[1], x11 -; NONEON-NOSVE-NEXT: mov v3.d[1], x8 -; NONEON-NOSVE-NEXT: mov v2.d[1], x14 -; NONEON-NOSVE-NEXT: stp q1, q0, [x1] -; NONEON-NOSVE-NEXT: stp q3, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzu x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q2, q3, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzu x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: fcvtzu x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] 
+; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fptoui <8 x half> %op1 to <8 x i64> @@ -439,76 +652,79 @@ define void @fcvtzu_v16f16_v16i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: mov h2, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s3, h1 -; NONEON-NOSVE-NEXT: ldr d4, [sp, #24] -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: mov h7, v0.h[2] -; NONEON-NOSVE-NEXT: mov h16, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt s6, h0 -; NONEON-NOSVE-NEXT: mov h0, v0.h[1] -; NONEON-NOSVE-NEXT: mov h1, v1.h[1] -; NONEON-NOSVE-NEXT: fcvt s17, h4 -; NONEON-NOSVE-NEXT: mov h18, v4.h[2] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvtzu x8, s3 -; NONEON-NOSVE-NEXT: fcvt s3, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h7 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: mov h16, v4.h[3] -; NONEON-NOSVE-NEXT: fcvtzu x9, s6 -; NONEON-NOSVE-NEXT: ldr d6, [sp, #8] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: mov h4, v4.h[1] -; NONEON-NOSVE-NEXT: fcvtzu x11, s2 -; NONEON-NOSVE-NEXT: mov h2, v6.h[2] -; NONEON-NOSVE-NEXT: fcvtzu x10, s17 -; NONEON-NOSVE-NEXT: fcvtzu x13, s5 -; NONEON-NOSVE-NEXT: fcvtzu x12, s3 -; NONEON-NOSVE-NEXT: mov h3, v6.h[3] -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: mov h5, v6.h[1] -; NONEON-NOSVE-NEXT: fcvt s17, h18 -; NONEON-NOSVE-NEXT: fcvtzu x14, s7 -; NONEON-NOSVE-NEXT: fmov d7, x8 +; NONEON-NOSVE-NEXT: sub sp, sp, #192 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 192 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; 
NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvtzu x8, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fmov d0, x11 -; NONEON-NOSVE-NEXT: fcvtzu x11, s1 -; NONEON-NOSVE-NEXT: fmov d1, x13 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvtzu x13, s16 -; NONEON-NOSVE-NEXT: fmov d16, x9 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvtzu x15, s17 -; NONEON-NOSVE-NEXT: mov v0.d[1], x12 -; NONEON-NOSVE-NEXT: mov v1.d[1], x14 -; NONEON-NOSVE-NEXT: fcvtzu x9, s2 -; NONEON-NOSVE-NEXT: mov v16.d[1], x8 -; NONEON-NOSVE-NEXT: fcvtzu x8, s6 -; NONEON-NOSVE-NEXT: fcvtzu x14, s4 -; NONEON-NOSVE-NEXT: fcvtzu x12, s3 -; NONEON-NOSVE-NEXT: mov v7.d[1], x11 -; NONEON-NOSVE-NEXT: fmov d3, x10 -; NONEON-NOSVE-NEXT: fcvtzu x11, s5 -; NONEON-NOSVE-NEXT: fmov d2, x15 -; NONEON-NOSVE-NEXT: stp q16, q1, [x1, #64] -; NONEON-NOSVE-NEXT: fmov d1, x9 -; NONEON-NOSVE-NEXT: fmov d4, x8 -; NONEON-NOSVE-NEXT: stp q7, q0, [x1] -; NONEON-NOSVE-NEXT: mov v2.d[1], x13 -; NONEON-NOSVE-NEXT: mov v3.d[1], x14 -; NONEON-NOSVE-NEXT: mov v1.d[1], x12 -; NONEON-NOSVE-NEXT: mov v4.d[1], x11 -; NONEON-NOSVE-NEXT: stp q3, q2, [x1, #96] -; NONEON-NOSVE-NEXT: stp q4, q1, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #96] +; NONEON-NOSVE-NEXT: fcvtzu x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #112] +; NONEON-NOSVE-NEXT: fcvtzu x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #32] +; 
NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzu x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q3, q4, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzu x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #160] +; NONEON-NOSVE-NEXT: fcvtzu x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #176] +; NONEON-NOSVE-NEXT: ldp q6, q7, [sp, #160] +; NONEON-NOSVE-NEXT: fcvtzu x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #128] +; NONEON-NOSVE-NEXT: fcvtzu x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #144] +; NONEON-NOSVE-NEXT: ldp q5, q2, [sp, #128] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: stp q3, q4, [x1, #32] +; NONEON-NOSVE-NEXT: stp q6, q7, [x1, #64] +; NONEON-NOSVE-NEXT: stp q5, q2, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #192 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fptoui <16 x half> %op1 to <16 x i64> @@ -531,7 +747,14 @@ define <2 x i16> @fcvtzu_v2f32_v2i16(<2 x float> 
%op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v2f32_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtzs v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp] +; NONEON-NOSVE-NEXT: fcvtzu w9, s1 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x float> %op1 to <2 x i16> ret <2 x i16> %res @@ -549,8 +772,20 @@ define <4 x i16> @fcvtzu_v4f32_v4i16(<4 x float> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v4f32_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s -; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptoui <4 x float> %op1 to <4 x i16> ret <4 x i16> %res @@ -572,10 +807,31 @@ define <8 x i16> @fcvtzu_v8f32_v8i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v8f32_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtzu v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fptoui <8 x float> %op1 to <8 x i16> @@ -604,15 +860,56 @@ define void @fcvtzu_v16f32_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v16f32_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: fcvtzu v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtzu v3.4s, v3.4s -; NONEON-NOSVE-NEXT: fcvtzu v2.4s, v2.4s -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0, #32] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: str q2, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #32] +; 
NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x float>, ptr %a %res = fptoui <16 x float> %op1 to <16 x i16> @@ -635,7 +932,14 @@ define <2 x i32> @fcvtzu_v2f32_v2i32(<2 x float> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v2f32_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtzu v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp] +; NONEON-NOSVE-NEXT: fcvtzu w9, s1 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x float> %op1 to <2 x i32> ret <2 x i32> %res @@ -652,7 +956,18 @@ define <4 x i32> @fcvtzu_v4f32_v4i32(<4 x float> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v4f32_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtzu w9, s1 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtzu w9, s1 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptoui <4 x float> %op1 to <4 x i32> ret <4 x i32> %res @@ -670,10 +985,28 @@ define void @fcvtzu_v8f32_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v8f32_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtzu v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtzu w9, s1 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: fcvtzu w9, s1 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzu w9, s1 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: fcvtzu w9, s1 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fptoui <8 x float> %op1 to <8 x i32> @@ -697,9 +1030,13 @@ define <1 x i64> @fcvtzu_v1f32_v1i64(<1 x float> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v1f32_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s -; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptoui <1 x float> %op1 to <1 x i64> ret <1 x i64> %res @@ -717,8 +1054,15 @@ define <2 x i64> @fcvtzu_v2f32_v2i64(<2 x float> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v2f32_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s -; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtzu x9, s1 +; 
NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x float> %op1 to <2 x i64> ret <2 x i64> %res @@ -740,15 +1084,21 @@ define void @fcvtzu_v4f32_v4i64(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvtzu_v4f32_v4i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s -; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s -; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fcvtzu v1.2d, v1.2d -; NONEON-NOSVE-NEXT: stp q0, q1, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtzu x9, s1 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzu x9, s1 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x float>, ptr %a %res = fptoui <4 x float> %op1 to <4 x i64> @@ -778,21 +1128,33 @@ define void @fcvtzu_v8f32_v8i64(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvtzu_v8f32_v8i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s -; NONEON-NOSVE-NEXT: fcvtl v2.2d, v2.2s -; NONEON-NOSVE-NEXT: fcvtl v3.2d, v3.2s -; NONEON-NOSVE-NEXT: fcvtzu v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fcvtzu v2.2d, v2.2d -; NONEON-NOSVE-NEXT: fcvtzu v3.2d, v3.2d -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-128]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvtzu x9, s1 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #80] +; NONEON-NOSVE-NEXT: fcvtzu x9, s1 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzu x9, s1 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #64] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #112] +; NONEON-NOSVE-NEXT: fcvtzu x9, s1 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fptoui <8 x float> %op1 to <8 x i64> @@ -814,8 +1176,12 @@ define <1 x i16> @fcvtzu_v1f64_v1i16(<1 x double> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v1f64_v1i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; 
NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fcvtzs w8, d0 -; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptoui <1 x double> %op1 to <1 x i16> ret <1 x i16> %res @@ -833,8 +1199,14 @@ define <2 x i16> @fcvtzu_v2f64_v2i16(<2 x double> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v2f64_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: xtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x double> %op1 to <2 x i16> ret <2 x i16> %res @@ -867,11 +1239,27 @@ define <4 x i16> @fcvtzu_v4f64_v4i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-80]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #40] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #56] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: strh w9, [sp, #78] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: strh w9, [sp, #74] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #72] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptoui <4 x double> %op1 to <4 x i16> @@ -919,19 +1307,49 @@ define <8 x i16> @fcvtzu_v8f64_v8i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v8f64_v8i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #144 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 144 ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] -; NONEON-NOSVE-NEXT: adrp x8, .LCPI26_0 -; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtzs v2.2d, v2.2d -; NONEON-NOSVE-NEXT: fcvtzs v3.2d, v3.2d -; NONEON-NOSVE-NEXT: xtn v7.2s, v0.2d -; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI26_0] -; NONEON-NOSVE-NEXT: xtn v6.2s, v1.2d -; NONEON-NOSVE-NEXT: xtn v5.2s, v2.2d -; NONEON-NOSVE-NEXT: xtn v4.2s, v3.2d -; NONEON-NOSVE-NEXT: tbl v0.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v0.16b +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: str q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: str q2, [sp] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] 
+; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d2, [sp, #80] +; NONEON-NOSVE-NEXT: ldr d1, [sp, #72] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #104] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: str d2, [sp, #120] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #64] +; NONEON-NOSVE-NEXT: strh w9, [sp, #142] +; NONEON-NOSVE-NEXT: strh w8, [sp, #140] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: str d0, [sp, #96] +; NONEON-NOSVE-NEXT: strh w9, [sp, #138] +; NONEON-NOSVE-NEXT: strh w8, [sp, #136] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: strh w9, [sp, #134] +; NONEON-NOSVE-NEXT: strh w8, [sp, #132] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #96] +; NONEON-NOSVE-NEXT: strh w9, [sp, #130] +; NONEON-NOSVE-NEXT: strh w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #128] +; NONEON-NOSVE-NEXT: add sp, sp, #144 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x double>, ptr %a %res = fptoui <8 x double> %op1 to <8 x i16> @@ -1012,31 +1430,90 @@ define void @fcvtzu_v16f64_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v16f64_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #96] -; NONEON-NOSVE-NEXT: adrp x8, .LCPI27_0 -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q4, q5, [x0, #64] -; NONEON-NOSVE-NEXT: ldp q7, q6, [x0] -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fcvtzs v3.2d, v3.2d -; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtzs v2.2d, v2.2d -; NONEON-NOSVE-NEXT: fcvtzs v5.2d, v5.2d -; 
NONEON-NOSVE-NEXT: fcvtzs v4.2d, v4.2d -; NONEON-NOSVE-NEXT: fcvtzs v6.2d, v6.2d -; NONEON-NOSVE-NEXT: fcvtzs v7.2d, v7.2d -; NONEON-NOSVE-NEXT: xtn v19.2s, v0.2d -; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI27_0] -; NONEON-NOSVE-NEXT: xtn v23.2s, v3.2d -; NONEON-NOSVE-NEXT: xtn v18.2s, v1.2d -; NONEON-NOSVE-NEXT: xtn v22.2s, v2.2d -; NONEON-NOSVE-NEXT: xtn v17.2s, v5.2d -; NONEON-NOSVE-NEXT: xtn v21.2s, v6.2d -; NONEON-NOSVE-NEXT: xtn v16.2s, v4.2d -; NONEON-NOSVE-NEXT: xtn v20.2s, v7.2d -; NONEON-NOSVE-NEXT: tbl v1.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b -; NONEON-NOSVE-NEXT: tbl v0.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b -; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: sub sp, sp, #304 +; NONEON-NOSVE-NEXT: str x29, [sp, #288] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 304 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] +; NONEON-NOSVE-NEXT: ldr x29, [sp, #288] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q6, q7, [x0] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0, #64] +; NONEON-NOSVE-NEXT: ldp q4, q5, [x0, #96] +; NONEON-NOSVE-NEXT: stp q1, q7, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q2, [sp, #96] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q6, q4, [sp] +; NONEON-NOSVE-NEXT: stp q5, q3, [sp, #32] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #168] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #184] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #176] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #136] +; 
NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #152] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #144] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #112] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #160] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d2, [sp, #176] +; NONEON-NOSVE-NEXT: ldr d1, [sp, #168] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #232] +; NONEON-NOSVE-NEXT: ldr d1, [sp, #136] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #192] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #232] +; NONEON-NOSVE-NEXT: str d2, [sp, #248] +; NONEON-NOSVE-NEXT: ldp d0, d2, [sp, #144] +; NONEON-NOSVE-NEXT: strh w9, [sp, #270] +; NONEON-NOSVE-NEXT: strh w8, [sp, #268] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #248] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #200] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #160] +; NONEON-NOSVE-NEXT: strh w9, [sp, #266] +; NONEON-NOSVE-NEXT: strh w8, [sp, #264] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #240] +; NONEON-NOSVE-NEXT: stp d2, d0, [sp, #216] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #192] +; NONEON-NOSVE-NEXT: strh w9, [sp, #262] +; NONEON-NOSVE-NEXT: strh w8, [sp, #260] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #200] +; NONEON-NOSVE-NEXT: str d0, [sp, #296] +; NONEON-NOSVE-NEXT: strh w9, [sp, #258] +; NONEON-NOSVE-NEXT: strh w8, [sp, #256] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #216] +; NONEON-NOSVE-NEXT: strh w9, [sp, #286] +; NONEON-NOSVE-NEXT: strh w8, [sp, #284] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #208] +; NONEON-NOSVE-NEXT: strh w9, [sp, #282] +; NONEON-NOSVE-NEXT: strh w8, [sp, #280] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #224] +; NONEON-NOSVE-NEXT: strh w8, [sp, #276] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #300] +; NONEON-NOSVE-NEXT: strh 
w9, [sp, #278] +; NONEON-NOSVE-NEXT: strh w8, [sp, #274] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #296] +; NONEON-NOSVE-NEXT: strh w8, [sp, #272] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #256] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #304 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x double>, ptr %a %res = fptoui <16 x double> %op1 to <16 x i16> @@ -1060,9 +1537,12 @@ define <1 x i32> @fcvtzu_v1f64_v1i32(<1 x double> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v1f64_v1i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d -; NONEON-NOSVE-NEXT: xtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: str w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptoui <1 x double> %op1 to <1 x i32> ret <1 x i32> %res @@ -1080,8 +1560,14 @@ define <2 x i32> @fcvtzu_v2f64_v2i32(<2 x double> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v2f64_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d -; NONEON-NOSVE-NEXT: xtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x double> %op1 to <2 x i32> ret <2 x i32> %res @@ -1103,10 +1589,19 @@ define <4 x i32> @fcvtzu_v4f64_v4i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtzu v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptoui <4 x double> %op1 to <4 x i32> @@ -1135,15 +1630,32 @@ define void @fcvtzu_v8f64_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v8f64_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: fcvtzu v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fcvtzu v3.2d, v3.2d -; NONEON-NOSVE-NEXT: fcvtzu v2.2d, v2.2d -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0, #32] +; NONEON-NOSVE-NEXT: str q1, [sp] +; 
NONEON-NOSVE-NEXT: stp q3, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: str q2, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x double>, ptr %a %res = fptoui <8 x double> %op1 to <8 x i32> @@ -1166,8 +1678,12 @@ define <1 x i64> @fcvtzu_v1f64_v1i64(<1 x double> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v1f64_v1i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fcvtzu x8, d0 -; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptoui <1 x double> %op1 to <1 x i64> ret <1 x i64> %res @@ -1184,7 +1700,14 @@ define <2 x i64> @fcvtzu_v2f64_v2i64(<2 x double> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v2f64_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: fcvtzu x9, d1 +; NONEON-NOSVE-NEXT: fcvtzu x8, d0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x double> %op1 to <2 x i64> ret <2 x i64> %res @@ -1202,10 +1725,20 @@ define void @fcvtzu_v4f64_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fcvtzu v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: fcvtzu x9, d1 +; NONEON-NOSVE-NEXT: fcvtzu x8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzu x9, d1 +; NONEON-NOSVE-NEXT: fcvtzu x8, d0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptoui <4 x double> %op1 to <4 x i64> @@ -1228,9 +1761,26 @@ define <4 x i16> @fcvtzs_v4f16_v4i16(<4 x half> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s -; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptosi <4 x half> %op1 to <4 x i16> ret <4 x i16> %res @@ -1248,16 +1798,43 @@ define void @fcvtzs_v8f16_v8i16(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #22] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #20] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #18] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: str q0, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fptosi <8 x half> %op1 to <8 x i16> @@ -1278,22 +1855,75 @@ define void @fcvtzs_v16f16_v16i16(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h -; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtzs v2.4s, v2.4s -; NONEON-NOSVE-NEXT: fcvtzs v3.4s, v3.4s -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v3.8h -; NONEON-NOSVE-NEXT: stp q1, q0, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] 
+; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fptosi <16 x half> %op1 to <16 x i16> @@ -1317,9 +1947,17 @@ define <2 x i32> @fcvtzs_v2f16_v2i32(<2 x half> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v2f16_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptosi <2 x half> %op1 to <2 x i32> ret <2 x i32> %res @@ -1337,8 +1975,25 @@ define <4 x i32> @fcvtzs_v4f16_v4i32(<4 x half> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtzs w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptosi <4 x half> %op1 to <4 x i32> ret <4 x i32> %res @@ -1360,15 +2015,41 @@ define void @fcvtzs_v8f16_v8i32(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s -; NONEON-NOSVE-NEXT: stp q0, q1, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: fcvtzs w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzs w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: fcvtzs w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fptosi <8 x half> %op1 to <8 x i32> @@ -1398,21 +2079,73 @@ define void @fcvtzs_v16f16_v16i32(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, 
#-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h -; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtzs v2.4s, v2.4s -; NONEON-NOSVE-NEXT: fcvtzs v3.4s, v3.4s -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-128]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: fcvtzs w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] +; NONEON-NOSVE-NEXT: fcvtzs w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: fcvtzs w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: 
ldp q3, q2, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzs w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: fcvtzs w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: fcvtzs w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: fcvtzs w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fptosi <16 x half> %op1 to <16 x i32> @@ -1433,9 +2166,13 @@ define <1 x i64> @fcvtzs_v1f16_v1i64(<1 x half> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v1f16_v1i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvtzs x8, s0 -; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptosi <1 x half> %op1 to <1 x i64> ret <1 x i64> %res @@ -1456,14 +2193,18 @@ define <2 x i64> @fcvtzs_v2f16_v2i64(<2 x half> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v2f16_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def 
$d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h1, v0.h[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvtzs x8, s0 -; NONEON-NOSVE-NEXT: fcvtzs x9, s1 -; NONEON-NOSVE-NEXT: fmov d0, x8 -; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptosi <2 x half> %op1 to <2 x i64> ret <2 x i64> %res @@ -1490,23 +2231,27 @@ define void @fcvtzs_v4f16_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #48 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 ; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: mov h1, v0.h[2] -; NONEON-NOSVE-NEXT: mov h2, v0.h[3] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 ; NONEON-NOSVE-NEXT: fcvtzs x9, s0 -; NONEON-NOSVE-NEXT: fcvtzs x8, s1 -; NONEON-NOSVE-NEXT: fcvtzs x10, s2 -; NONEON-NOSVE-NEXT: fcvtzs x11, s3 -; NONEON-NOSVE-NEXT: fmov d1, x9 -; NONEON-NOSVE-NEXT: fmov d0, x8 -; NONEON-NOSVE-NEXT: mov v1.d[1], x11 -; NONEON-NOSVE-NEXT: mov v0.d[1], x10 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: fcvtzs x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs 
x8, s0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #16] ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x half>, ptr %a %res = fptosi <4 x half> %op1 to <4 x i64> @@ -1549,42 +2294,43 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] -; NONEON-NOSVE-NEXT: mov h1, v0.h[2] -; NONEON-NOSVE-NEXT: mov h3, v0.h[3] -; NONEON-NOSVE-NEXT: mov h4, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov h5, v2.h[2] -; NONEON-NOSVE-NEXT: mov h6, v2.h[3] -; NONEON-NOSVE-NEXT: mov h7, v2.h[1] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: str q0, [sp, #-96]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvtzs x9, s0 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvtzs x13, s2 -; NONEON-NOSVE-NEXT: fcvtzs x8, s1 -; NONEON-NOSVE-NEXT: fcvt s1, h7 -; NONEON-NOSVE-NEXT: fcvtzs x10, s3 -; NONEON-NOSVE-NEXT: fcvtzs x11, s4 -; NONEON-NOSVE-NEXT: fcvtzs x12, s5 -; NONEON-NOSVE-NEXT: fcvtzs x14, s6 -; NONEON-NOSVE-NEXT: fmov d3, x13 -; NONEON-NOSVE-NEXT: fmov d0, x8 -; NONEON-NOSVE-NEXT: fcvtzs x8, s1 -; NONEON-NOSVE-NEXT: fmov d1, x9 -; NONEON-NOSVE-NEXT: fmov d2, x12 -; NONEON-NOSVE-NEXT: mov v0.d[1], x10 -; NONEON-NOSVE-NEXT: mov v1.d[1], x11 -; NONEON-NOSVE-NEXT: mov v3.d[1], x8 -; NONEON-NOSVE-NEXT: mov v2.d[1], x14 -; NONEON-NOSVE-NEXT: stp q1, q0, [x1] -; NONEON-NOSVE-NEXT: stp q3, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzs x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q2, q3, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzs x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: fcvtzs x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] 
+; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fptosi <8 x half> %op1 to <8 x i64> @@ -1649,76 +2395,79 @@ define void @fcvtzs_v16f16_v16i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: mov h2, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s3, h1 -; NONEON-NOSVE-NEXT: ldr d4, [sp, #24] -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: mov h7, v0.h[2] -; NONEON-NOSVE-NEXT: mov h16, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt s6, h0 -; NONEON-NOSVE-NEXT: mov h0, v0.h[1] -; NONEON-NOSVE-NEXT: mov h1, v1.h[1] -; NONEON-NOSVE-NEXT: fcvt s17, h4 -; NONEON-NOSVE-NEXT: mov h18, v4.h[2] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvtzs x8, s3 -; NONEON-NOSVE-NEXT: fcvt s3, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h7 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: mov h16, v4.h[3] -; NONEON-NOSVE-NEXT: fcvtzs x9, s6 -; NONEON-NOSVE-NEXT: ldr d6, [sp, #8] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: mov h4, v4.h[1] -; NONEON-NOSVE-NEXT: fcvtzs x11, s2 -; NONEON-NOSVE-NEXT: mov h2, v6.h[2] -; NONEON-NOSVE-NEXT: fcvtzs x10, s17 -; NONEON-NOSVE-NEXT: fcvtzs x13, s5 -; NONEON-NOSVE-NEXT: fcvtzs x12, s3 -; NONEON-NOSVE-NEXT: mov h3, v6.h[3] -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: mov h5, v6.h[1] -; NONEON-NOSVE-NEXT: fcvt s17, h18 -; NONEON-NOSVE-NEXT: fcvtzs x14, s7 -; NONEON-NOSVE-NEXT: fmov d7, x8 +; NONEON-NOSVE-NEXT: sub sp, sp, #192 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 192 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] 
+; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvtzs x8, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fmov d0, x11 -; NONEON-NOSVE-NEXT: fcvtzs x11, s1 -; NONEON-NOSVE-NEXT: fmov d1, x13 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvtzs x13, s16 -; NONEON-NOSVE-NEXT: fmov d16, x9 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvtzs x15, s17 -; NONEON-NOSVE-NEXT: mov v0.d[1], x12 -; NONEON-NOSVE-NEXT: mov v1.d[1], x14 -; NONEON-NOSVE-NEXT: fcvtzs x9, s2 -; NONEON-NOSVE-NEXT: mov v16.d[1], x8 -; NONEON-NOSVE-NEXT: fcvtzs x8, s6 -; NONEON-NOSVE-NEXT: fcvtzs x14, s4 -; NONEON-NOSVE-NEXT: fcvtzs x12, s3 -; NONEON-NOSVE-NEXT: mov v7.d[1], x11 -; NONEON-NOSVE-NEXT: fmov d3, x10 -; NONEON-NOSVE-NEXT: fcvtzs x11, s5 -; NONEON-NOSVE-NEXT: fmov d2, x15 -; NONEON-NOSVE-NEXT: stp q16, q1, [x1, #64] -; NONEON-NOSVE-NEXT: fmov d1, x9 -; NONEON-NOSVE-NEXT: fmov d4, x8 -; NONEON-NOSVE-NEXT: stp q7, q0, [x1] -; NONEON-NOSVE-NEXT: mov v2.d[1], x13 -; NONEON-NOSVE-NEXT: mov v3.d[1], x14 -; NONEON-NOSVE-NEXT: mov v1.d[1], x12 -; NONEON-NOSVE-NEXT: mov v4.d[1], x11 -; NONEON-NOSVE-NEXT: stp q3, q2, [x1, #96] -; NONEON-NOSVE-NEXT: stp q4, q1, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #96] +; NONEON-NOSVE-NEXT: fcvtzs x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #112] +; NONEON-NOSVE-NEXT: fcvtzs x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #32] +; 
NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzs x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q3, q4, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzs x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #160] +; NONEON-NOSVE-NEXT: fcvtzs x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #176] +; NONEON-NOSVE-NEXT: ldp q6, q7, [sp, #160] +; NONEON-NOSVE-NEXT: fcvtzs x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #128] +; NONEON-NOSVE-NEXT: fcvtzs x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #144] +; NONEON-NOSVE-NEXT: ldp q5, q2, [sp, #128] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: stp q3, q4, [x1, #32] +; NONEON-NOSVE-NEXT: stp q6, q7, [x1, #64] +; NONEON-NOSVE-NEXT: stp q5, q2, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #192 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fptosi <16 x half> %op1 to <16 x i64> @@ -1741,7 +2490,14 @@ define <2 x i16> @fcvtzs_v2f32_v2i16(<2 x float> 
%op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v2f32_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtzs v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp] +; NONEON-NOSVE-NEXT: fcvtzs w9, s1 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptosi <2 x float> %op1 to <2 x i16> ret <2 x i16> %res @@ -1759,8 +2515,20 @@ define <4 x i16> @fcvtzs_v4f32_v4i16(<4 x float> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v4f32_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s -; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptosi <4 x float> %op1 to <4 x i16> ret <4 x i16> %res @@ -1782,10 +2550,31 @@ define <8 x i16> @fcvtzs_v8f32_v8i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v8f32_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fptosi <8 x float> %op1 to <8 x i16> @@ -1814,15 +2603,56 @@ define void @fcvtzs_v16f32_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v16f32_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtzs v3.4s, v3.4s -; NONEON-NOSVE-NEXT: fcvtzs v2.4s, v2.4s -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0, #32] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: str q2, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #32] +; 
NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x float>, ptr %a %res = fptosi <16 x float> %op1 to <16 x i16> @@ -1845,7 +2675,14 @@ define <2 x i32> @fcvtzs_v2f32_v2i32(<2 x float> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v2f32_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtzs v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp] +; NONEON-NOSVE-NEXT: fcvtzs w9, s1 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptosi <2 x float> %op1 to <2 x i32> ret <2 x i32> %res @@ -1862,7 +2699,18 @@ define <4 x i32> @fcvtzs_v4f32_v4i32(<4 x float> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v4f32_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtzs w9, s1 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtzs w9, s1 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptosi <4 x float> %op1 to <4 x i32> ret <4 x i32> %res @@ -1880,10 +2728,28 @@ define void @fcvtzs_v8f32_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v8f32_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtzs w9, s1 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: fcvtzs w9, s1 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzs w9, s1 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: fcvtzs w9, s1 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fptosi <8 x float> %op1 to <8 x i32> @@ -1907,9 +2773,13 @@ define <1 x i64> @fcvtzs_v1f32_v1i64(<1 x float> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v1f32_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptosi <1 x float> %op1 to <1 x i64> ret <1 x i64> %res @@ -1927,8 +2797,15 @@ define <2 x i64> @fcvtzs_v2f32_v2i64(<2 x float> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v2f32_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtzs x9, s1 +; 
NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptosi <2 x float> %op1 to <2 x i64> ret <2 x i64> %res @@ -1950,15 +2827,21 @@ define void @fcvtzs_v4f32_v4i64(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvtzs_v4f32_v4i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s -; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d -; NONEON-NOSVE-NEXT: stp q0, q1, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtzs x9, s1 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzs x9, s1 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x float>, ptr %a %res = fptosi <4 x float> %op1 to <4 x i64> @@ -1988,21 +2871,33 @@ define void @fcvtzs_v8f32_v8i64(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvtzs_v8f32_v8i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s -; NONEON-NOSVE-NEXT: fcvtl v2.2d, v2.2s -; NONEON-NOSVE-NEXT: fcvtl v3.2d, v3.2s -; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fcvtzs v2.2d, v2.2d -; NONEON-NOSVE-NEXT: fcvtzs v3.2d, v3.2d -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-128]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvtzs x9, s1 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #80] +; NONEON-NOSVE-NEXT: fcvtzs x9, s1 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzs x9, s1 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #64] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #112] +; NONEON-NOSVE-NEXT: fcvtzs x9, s1 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fptosi <8 x float> %op1 to <8 x i64> @@ -2026,8 +2921,12 @@ define <1 x i16> @fcvtzs_v1f64_v1i16(<1 x double> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v1f64_v1i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; 
NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fcvtzs w8, d0 -; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptosi <1 x double> %op1 to <1 x i16> ret <1 x i16> %res @@ -2045,8 +2944,14 @@ define <2 x i16> @fcvtzs_v2f64_v2i16(<2 x double> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v2f64_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: xtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptosi <2 x double> %op1 to <2 x i16> ret <2 x i16> %res @@ -2079,11 +2984,27 @@ define <4 x i16> @fcvtzs_v4f64_v4i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-80]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #40] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #56] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: strh w9, [sp, #78] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: strh w9, [sp, #74] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #72] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptosi <4 x double> %op1 to <4 x i16> @@ -2131,19 +3052,49 @@ define <8 x i16> @fcvtzs_v8f64_v8i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v8f64_v8i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #144 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 144 ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] -; NONEON-NOSVE-NEXT: adrp x8, .LCPI61_0 -; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtzs v2.2d, v2.2d -; NONEON-NOSVE-NEXT: fcvtzs v3.2d, v3.2d -; NONEON-NOSVE-NEXT: xtn v7.2s, v0.2d -; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI61_0] -; NONEON-NOSVE-NEXT: xtn v6.2s, v1.2d -; NONEON-NOSVE-NEXT: xtn v5.2s, v2.2d -; NONEON-NOSVE-NEXT: xtn v4.2s, v3.2d -; NONEON-NOSVE-NEXT: tbl v0.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v0.16b +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: str q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: str q2, [sp] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, 
#48] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d2, [sp, #80] +; NONEON-NOSVE-NEXT: ldr d1, [sp, #72] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #104] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: str d2, [sp, #120] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #64] +; NONEON-NOSVE-NEXT: strh w9, [sp, #142] +; NONEON-NOSVE-NEXT: strh w8, [sp, #140] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: str d0, [sp, #96] +; NONEON-NOSVE-NEXT: strh w9, [sp, #138] +; NONEON-NOSVE-NEXT: strh w8, [sp, #136] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: strh w9, [sp, #134] +; NONEON-NOSVE-NEXT: strh w8, [sp, #132] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #96] +; NONEON-NOSVE-NEXT: strh w9, [sp, #130] +; NONEON-NOSVE-NEXT: strh w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #128] +; NONEON-NOSVE-NEXT: add sp, sp, #144 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x double>, ptr %a %res = fptosi <8 x double> %op1 to <8 x i16> @@ -2224,31 +3175,90 @@ define void @fcvtzs_v16f64_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v16f64_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #96] -; NONEON-NOSVE-NEXT: adrp x8, .LCPI62_0 -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q4, q5, [x0, #64] -; NONEON-NOSVE-NEXT: ldp q7, q6, [x0] -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fcvtzs v3.2d, v3.2d -; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtzs v2.2d, v2.2d -; NONEON-NOSVE-NEXT: fcvtzs v5.2d, v5.2d 
-; NONEON-NOSVE-NEXT: fcvtzs v4.2d, v4.2d -; NONEON-NOSVE-NEXT: fcvtzs v6.2d, v6.2d -; NONEON-NOSVE-NEXT: fcvtzs v7.2d, v7.2d -; NONEON-NOSVE-NEXT: xtn v19.2s, v0.2d -; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI62_0] -; NONEON-NOSVE-NEXT: xtn v23.2s, v3.2d -; NONEON-NOSVE-NEXT: xtn v18.2s, v1.2d -; NONEON-NOSVE-NEXT: xtn v22.2s, v2.2d -; NONEON-NOSVE-NEXT: xtn v17.2s, v5.2d -; NONEON-NOSVE-NEXT: xtn v21.2s, v6.2d -; NONEON-NOSVE-NEXT: xtn v16.2s, v4.2d -; NONEON-NOSVE-NEXT: xtn v20.2s, v7.2d -; NONEON-NOSVE-NEXT: tbl v1.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b -; NONEON-NOSVE-NEXT: tbl v0.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b -; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: sub sp, sp, #304 +; NONEON-NOSVE-NEXT: str x29, [sp, #288] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 304 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] +; NONEON-NOSVE-NEXT: ldr x29, [sp, #288] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q6, q7, [x0] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0, #64] +; NONEON-NOSVE-NEXT: ldp q4, q5, [x0, #96] +; NONEON-NOSVE-NEXT: stp q1, q7, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q2, [sp, #96] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q6, q4, [sp] +; NONEON-NOSVE-NEXT: stp q5, q3, [sp, #32] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #168] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #184] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #176] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #136] +; 
NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #152] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #144] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #112] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #160] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d2, [sp, #176] +; NONEON-NOSVE-NEXT: ldr d1, [sp, #168] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #232] +; NONEON-NOSVE-NEXT: ldr d1, [sp, #136] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #192] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #232] +; NONEON-NOSVE-NEXT: str d2, [sp, #248] +; NONEON-NOSVE-NEXT: ldp d0, d2, [sp, #144] +; NONEON-NOSVE-NEXT: strh w9, [sp, #270] +; NONEON-NOSVE-NEXT: strh w8, [sp, #268] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #248] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #200] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #160] +; NONEON-NOSVE-NEXT: strh w9, [sp, #266] +; NONEON-NOSVE-NEXT: strh w8, [sp, #264] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #240] +; NONEON-NOSVE-NEXT: stp d2, d0, [sp, #216] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #192] +; NONEON-NOSVE-NEXT: strh w9, [sp, #262] +; NONEON-NOSVE-NEXT: strh w8, [sp, #260] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #200] +; NONEON-NOSVE-NEXT: str d0, [sp, #296] +; NONEON-NOSVE-NEXT: strh w9, [sp, #258] +; NONEON-NOSVE-NEXT: strh w8, [sp, #256] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #216] +; NONEON-NOSVE-NEXT: strh w9, [sp, #286] +; NONEON-NOSVE-NEXT: strh w8, [sp, #284] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #208] +; NONEON-NOSVE-NEXT: strh w9, [sp, #282] +; NONEON-NOSVE-NEXT: strh w8, [sp, #280] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #224] +; NONEON-NOSVE-NEXT: strh w8, [sp, #276] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #300] +; NONEON-NOSVE-NEXT: strh 
w9, [sp, #278] +; NONEON-NOSVE-NEXT: strh w8, [sp, #274] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #296] +; NONEON-NOSVE-NEXT: strh w8, [sp, #272] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #256] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #304 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x double>, ptr %a %res = fptosi <16 x double> %op1 to <16 x i16> @@ -2272,9 +3282,12 @@ define <1 x i32> @fcvtzs_v1f64_v1i32(<1 x double> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v1f64_v1i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: xtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: str w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptosi <1 x double> %op1 to <1 x i32> ret <1 x i32> %res @@ -2292,8 +3305,14 @@ define <2 x i32> @fcvtzs_v2f64_v2i32(<2 x double> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v2f64_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: xtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptosi <2 x double> %op1 to <2 x i32> ret <2 x i32> %res @@ -2315,10 +3334,19 @@ define <4 x i32> @fcvtzs_v4f64_v4i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptosi <4 x double> %op1 to <4 x i32> @@ -2347,15 +3375,32 @@ define void @fcvtzs_v8f64_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v8f64_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fcvtzs v3.2d, v3.2d -; NONEON-NOSVE-NEXT: fcvtzs v2.2d, v2.2d -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0, #32] +; NONEON-NOSVE-NEXT: str q1, [sp] +; 
NONEON-NOSVE-NEXT: stp q3, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: str q2, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x double>, ptr %a %res = fptosi <8 x double> %op1 to <8 x i32> @@ -2378,8 +3423,12 @@ define <1 x i64> @fcvtzs_v1f64_v1i64(<1 x double> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v1f64_v1i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fcvtzs x8, d0 -; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptosi <1 x double> %op1 to <1 x i64> ret <1 x i64> %res @@ -2396,7 +3445,14 @@ define <2 x i64> @fcvtzs_v2f64_v2i64(<2 x double> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v2f64_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: fcvtzs x9, d1 +; NONEON-NOSVE-NEXT: fcvtzs x8, d0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptosi <2 x double> %op1 to <2 x i64> ret <2 x i64> %res @@ -2414,10 +3470,20 @@ define void @fcvtzs_v4f64_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: fcvtzs x9, d1 +; NONEON-NOSVE-NEXT: fcvtzs x8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzs x9, d1 +; NONEON-NOSVE-NEXT: fcvtzs x8, d0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptosi <4 x double> %op1 to <4 x i64> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll index d3b09374676556..69661049bcb6f3 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll @@ -31,10 +31,27 @@ define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x i1> %mask ; ; NONEON-NOSVE-LABEL: select_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: uzp1 v2.4h, v2.4h, v0.4h -; NONEON-NOSVE-NEXT: shl v2.4h, v2.4h, #15 -; NONEON-NOSVE-NEXT: cmlt v2.4h, v2.4h, #0 -; NONEON-NOSVE-NEXT: bif v0.8b, 
v1.8b, v2.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: str w10, [sp, #28] +; NONEON-NOSVE-NEXT: tst w9, #0xffff +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: tst w8, #0xffff +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select <2 x i1> %mask, <2 x half> %op1, <2 x half> %op2 ret <2 x half> %sel @@ -57,9 +74,40 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x i1> %mask ; ; NONEON-NOSVE-LABEL: select_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v2.4h, v2.4h, #15 -; NONEON-NOSVE-NEXT: cmlt v2.4h, v2.4h, #0 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #22] +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #18] +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: tst w9, #0xffff +; NONEON-NOSVE-NEXT: sbfx w9, w11, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: tst w9, #0xffff +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; 
NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: tst w9, #0xffff +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: tst w8, #0xffff +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select <4 x i1> %mask, <4 x half> %op1, <4 x half> %op2 ret <4 x half> %sel @@ -83,10 +131,68 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask ; ; NONEON-NOSVE-LABEL: select_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushll v2.8h, v2.8b, #0 -; NONEON-NOSVE-NEXT: shl v2.8h, v2.8h, #15 -; NONEON-NOSVE-NEXT: cmlt v2.8h, v2.8h, #0 -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: str d2, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #47] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #45] +; NONEON-NOSVE-NEXT: sbfx w13, w13, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #40] +; NONEON-NOSVE-NEXT: tst w13, #0xffff +; NONEON-NOSVE-NEXT: sbfx w13, w15, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w12, w12, #0, #1 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1 +; NONEON-NOSVE-NEXT: tst w13, 
#0xffff +; NONEON-NOSVE-NEXT: sbfx w13, w14, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: tst w13, #0xffff +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: tst w12, #0xffff +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: tst w11, #0xffff +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: tst w10, #0xffff +; NONEON-NOSVE-NEXT: str h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: tst w9, #0xffff +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: tst w8, #0xffff +; NONEON-NOSVE-NEXT: str h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %sel = select <8 x i1> %mask, <8 x half> %op1, <8 x half> %op2 ret <8 x half> %sel @@ -107,122 +213,126 @@ define void @select_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: select_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: mov 
h3, v0.h[1] -; NONEON-NOSVE-NEXT: mov h4, v1.h[2] -; NONEON-NOSVE-NEXT: mov h5, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h1 -; NONEON-NOSVE-NEXT: fcvt s7, h0 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: mov h17, v0.h[6] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-96]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h4, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h5, [sp, #4] +; NONEON-NOSVE-NEXT: ldr h16, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h1 +; NONEON-NOSVE-NEXT: ldr h17, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s6, h4 +; NONEON-NOSVE-NEXT: fcvt s7, h5 +; NONEON-NOSVE-NEXT: ldr h19, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s18, h17 +; NONEON-NOSVE-NEXT: ldr h21, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h22, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s20, h19 +; NONEON-NOSVE-NEXT: ldr h24, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h25, [sp, #34] ; NONEON-NOSVE-NEXT: fcmp s3, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[3] -; NONEON-NOSVE-NEXT: mov h3, v0.h[3] -; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcvt s2, h16 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h26, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h27, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h28, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h29, [sp, #44] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, eq ; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[4] -; NONEON-NOSVE-NEXT: mov h7, v0.h[4] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w14, eq -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; 
NONEON-NOSVE-NEXT: mov h4, v1.h[5] -; NONEON-NOSVE-NEXT: mov h5, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w12, eq -; NONEON-NOSVE-NEXT: fcmp s3, s2 -; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: ldr q3, [x1, #16] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w11, eq -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: mov h7, v0.h[7] -; NONEON-NOSVE-NEXT: mov h18, v3.h[3] -; NONEON-NOSVE-NEXT: csetm w13, eq -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v3.h[1] -; NONEON-NOSVE-NEXT: mov h5, v2.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: fcvt s7, h3 +; NONEON-NOSVE-NEXT: ldr h6, [sp, #26] +; NONEON-NOSVE-NEXT: fcsel s1, s5, s4, eq +; NONEON-NOSVE-NEXT: fcmp s18, s2 +; NONEON-NOSVE-NEXT: fcvt s4, h6 +; NONEON-NOSVE-NEXT: fcvt s18, h21 +; NONEON-NOSVE-NEXT: ldr h5, [sp, #28] +; NONEON-NOSVE-NEXT: str h0, [sp, #66] +; NONEON-NOSVE-NEXT: fcsel s2, s17, s16, eq +; NONEON-NOSVE-NEXT: fcmp s20, s7 +; NONEON-NOSVE-NEXT: fcvt s16, h5 +; NONEON-NOSVE-NEXT: fcvt s17, h22 +; NONEON-NOSVE-NEXT: ldr h7, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h20, [sp, #14] +; NONEON-NOSVE-NEXT: str h1, [sp, #68] +; NONEON-NOSVE-NEXT: fcsel s3, s19, s3, eq +; NONEON-NOSVE-NEXT: fcmp s18, s4 +; NONEON-NOSVE-NEXT: fcvt s19, h7 +; NONEON-NOSVE-NEXT: fcvt s23, h20 +; NONEON-NOSVE-NEXT: ldr h18, [sp, #48] +; NONEON-NOSVE-NEXT: str h2, [sp, #70] +; NONEON-NOSVE-NEXT: fcsel s4, s21, s6, eq ; NONEON-NOSVE-NEXT: fcmp s17, s16 -; NONEON-NOSVE-NEXT: mov h16, v3.h[2] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h17, v2.h[2] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w10, eq -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: fcvt s6, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h2 -; NONEON-NOSVE-NEXT: csetm w15, eq -; 
NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: fmov s4, w14 -; NONEON-NOSVE-NEXT: csetm w16, eq -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[3] -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s16, h17 -; NONEON-NOSVE-NEXT: mov v4.h[1], w8 ; NONEON-NOSVE-NEXT: fcvt s17, h18 -; NONEON-NOSVE-NEXT: csetm w14, eq -; NONEON-NOSVE-NEXT: fmov s5, w14 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s16, s7 -; NONEON-NOSVE-NEXT: mov h7, v3.h[4] -; NONEON-NOSVE-NEXT: mov h16, v2.h[4] -; NONEON-NOSVE-NEXT: mov v4.h[2], w12 -; NONEON-NOSVE-NEXT: mov v5.h[1], w16 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s6, s17 -; NONEON-NOSVE-NEXT: mov h17, v2.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h7 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: mov h16, v3.h[5] -; NONEON-NOSVE-NEXT: mov v4.h[3], w11 -; NONEON-NOSVE-NEXT: mov v5.h[2], w8 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcvt s17, h17 -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v3.h[6] -; NONEON-NOSVE-NEXT: mov h7, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: mov v4.h[4], w13 -; NONEON-NOSVE-NEXT: mov v5.h[3], w8 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcmp s17, s16 -; NONEON-NOSVE-NEXT: mov h16, v3.h[7] -; NONEON-NOSVE-NEXT: mov h17, v2.h[7] -; NONEON-NOSVE-NEXT: mov v5.h[4], w8 -; NONEON-NOSVE-NEXT: mov v4.h[5], w9 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: fcvt s6, h16 -; NONEON-NOSVE-NEXT: fcvt s7, h17 -; NONEON-NOSVE-NEXT: mov v5.h[5], w8 -; NONEON-NOSVE-NEXT: mov v4.h[6], w10 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov v5.h[6], w8 -; NONEON-NOSVE-NEXT: mov v4.h[7], w15 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: mov v5.h[7], w8 -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v4.16b -; NONEON-NOSVE-NEXT: 
mov v1.16b, v5.16b -; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: fcvt s21, h24 +; NONEON-NOSVE-NEXT: ldr h16, [sp, #50] +; NONEON-NOSVE-NEXT: str h3, [sp, #72] +; NONEON-NOSVE-NEXT: fcsel s5, s22, s5, eq +; NONEON-NOSVE-NEXT: fcmp s23, s19 +; NONEON-NOSVE-NEXT: fcvt s22, h16 +; NONEON-NOSVE-NEXT: fcvt s23, h25 +; NONEON-NOSVE-NEXT: ldr h19, [sp, #52] +; NONEON-NOSVE-NEXT: str h4, [sp, #74] +; NONEON-NOSVE-NEXT: fcsel s6, s20, s7, eq +; NONEON-NOSVE-NEXT: fcmp s21, s17 +; NONEON-NOSVE-NEXT: fcvt s20, h19 +; NONEON-NOSVE-NEXT: fcvt s21, h26 +; NONEON-NOSVE-NEXT: ldr h17, [sp, #54] +; NONEON-NOSVE-NEXT: str h5, [sp, #76] +; NONEON-NOSVE-NEXT: fcsel s7, s24, s18, eq +; NONEON-NOSVE-NEXT: fcmp s23, s22 +; NONEON-NOSVE-NEXT: fcvt s22, h17 +; NONEON-NOSVE-NEXT: fcvt s23, h27 +; NONEON-NOSVE-NEXT: ldr h18, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h24, [sp, #40] +; NONEON-NOSVE-NEXT: str h6, [sp, #78] +; NONEON-NOSVE-NEXT: fcsel s16, s25, s16, eq +; NONEON-NOSVE-NEXT: fcmp s21, s20 +; NONEON-NOSVE-NEXT: fcvt s21, h18 +; NONEON-NOSVE-NEXT: fcvt s25, h24 +; NONEON-NOSVE-NEXT: ldr h20, [sp, #58] +; NONEON-NOSVE-NEXT: str h7, [sp, #80] +; NONEON-NOSVE-NEXT: fcsel s19, s26, s19, eq +; NONEON-NOSVE-NEXT: fcmp s23, s22 +; NONEON-NOSVE-NEXT: fcvt s23, h20 +; NONEON-NOSVE-NEXT: fcvt s26, h28 +; NONEON-NOSVE-NEXT: ldr h22, [sp, #60] +; NONEON-NOSVE-NEXT: str h16, [sp, #82] +; NONEON-NOSVE-NEXT: fcsel s17, s27, s17, eq +; NONEON-NOSVE-NEXT: fcmp s25, s21 +; NONEON-NOSVE-NEXT: fcvt s25, h22 +; NONEON-NOSVE-NEXT: fcvt s27, h29 +; NONEON-NOSVE-NEXT: ldr h21, [sp, #62] +; NONEON-NOSVE-NEXT: str h19, [sp, #84] +; NONEON-NOSVE-NEXT: fcsel s18, s24, s18, eq +; NONEON-NOSVE-NEXT: ldr h24, [sp, #46] +; NONEON-NOSVE-NEXT: fcmp s26, s23 +; NONEON-NOSVE-NEXT: fcvt s23, h21 +; NONEON-NOSVE-NEXT: str h17, [sp, #86] +; NONEON-NOSVE-NEXT: fcvt s26, h24 +; NONEON-NOSVE-NEXT: fcsel s20, s28, s20, eq +; NONEON-NOSVE-NEXT: fcmp s27, s25 +; NONEON-NOSVE-NEXT: ldr h25, [sp, #16] +; 
NONEON-NOSVE-NEXT: ldr h27, [sp] +; NONEON-NOSVE-NEXT: str h18, [sp, #88] +; NONEON-NOSVE-NEXT: fcvt s17, h25 +; NONEON-NOSVE-NEXT: fcvt s18, h27 +; NONEON-NOSVE-NEXT: fcsel s7, s29, s22, eq +; NONEON-NOSVE-NEXT: fcmp s26, s23 +; NONEON-NOSVE-NEXT: str h20, [sp, #90] +; NONEON-NOSVE-NEXT: fcsel s16, s24, s21, eq +; NONEON-NOSVE-NEXT: str h7, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s18, s17 +; NONEON-NOSVE-NEXT: str h16, [sp, #94] +; NONEON-NOSVE-NEXT: fcsel s2, s27, s25, eq +; NONEON-NOSVE-NEXT: str h2, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -249,9 +359,22 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x i1> %m ; ; NONEON-NOSVE-LABEL: select_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v2.2s, v2.2s, #31 -; NONEON-NOSVE-NEXT: cmlt v2.2s, v2.2s, #0 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: stp d2, d0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp] +; NONEON-NOSVE-NEXT: str d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: sbfx w8, w9, #0, #1 +; NONEON-NOSVE-NEXT: fcsel s3, s2, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select <2 x i1> %mask, <2 x float> %op1, <2 x float> %op2 ret <2 x float> %sel @@ -275,10 +398,36 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %m ; ; NONEON-NOSVE-LABEL: select_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: shl v2.4s, v2.4s, #31 -; NONEON-NOSVE-NEXT: cmlt v2.4s, v2.4s, #0 -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: str d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #12] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #44] +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w9, #0 +; NONEON-NOSVE-NEXT: sbfx w9, w11, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: fcsel s3, s2, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #40] +; NONEON-NOSVE-NEXT: cmp w9, #0 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, #0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #56] +; NONEON-NOSVE-NEXT: ldr s0, [sp, 
#36] +; NONEON-NOSVE-NEXT: fcsel s3, s2, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #32] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %sel = select <4 x i1> %mask, <4 x float> %op1, <4 x float> %op2 ret <4 x float> %sel @@ -299,14 +448,45 @@ define void @select_v8f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: select_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] -; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] -; NONEON-NOSVE-NEXT: fcmeq v4.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcmeq v5.4s, v2.4s, v3.4s -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v4.16b -; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b -; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-96]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: stp q2, q3, [sp, #32] +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #20] +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #4] +; NONEON-NOSVE-NEXT: ldr s4, [sp, #12] +; NONEON-NOSVE-NEXT: ldr s17, [sp] +; NONEON-NOSVE-NEXT: ldp s6, s7, [sp, #36] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, eq +; NONEON-NOSVE-NEXT: fcmp s3, s2 +; NONEON-NOSVE-NEXT: ldp s1, s5, [sp, #28] +; NONEON-NOSVE-NEXT: fcsel s2, s3, s2, eq +; NONEON-NOSVE-NEXT: ldp s16, s3, [sp, #44] +; NONEON-NOSVE-NEXT: fcmp s4, s1 +; NONEON-NOSVE-NEXT: fcsel s1, s4, s1, eq +; NONEON-NOSVE-NEXT: fcmp s5, s3 +; NONEON-NOSVE-NEXT: ldr s4, [sp, #52] +; NONEON-NOSVE-NEXT: fcsel s3, s5, s3, eq +; NONEON-NOSVE-NEXT: fcmp s6, s4 +; NONEON-NOSVE-NEXT: ldr s5, [sp, #56] +; NONEON-NOSVE-NEXT: stp s2, s1, [sp, #72] +; NONEON-NOSVE-NEXT: fcsel s4, s6, s4, eq +; NONEON-NOSVE-NEXT: fcmp s7, s5 +; NONEON-NOSVE-NEXT: ldr s6, [sp, #60] +; NONEON-NOSVE-NEXT: fcsel s5, s7, s5, eq +; NONEON-NOSVE-NEXT: 
fcmp s16, s6 +; NONEON-NOSVE-NEXT: ldr s7, [sp, #16] +; NONEON-NOSVE-NEXT: stp s3, s4, [sp, #80] +; NONEON-NOSVE-NEXT: fcsel s6, s16, s6, eq +; NONEON-NOSVE-NEXT: fcmp s17, s7 +; NONEON-NOSVE-NEXT: fcsel s3, s17, s7, eq +; NONEON-NOSVE-NEXT: stp s5, s6, [sp, #88] +; NONEON-NOSVE-NEXT: stp s3, s0, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -325,10 +505,13 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x i1> ; ; NONEON-NOSVE-LABEL: select_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm x8, ne -; NONEON-NOSVE-NEXT: fmov d2, x8 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: fcsel d0, d0, d1, ne +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %sel = select <1 x i1> %mask, <1 x double> %op1, <1 x double> %op2 ret <1 x double> %sel @@ -352,10 +535,23 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1> ; ; NONEON-NOSVE-LABEL: select_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: shl v2.2d, v2.2d, #63 -; NONEON-NOSVE-NEXT: cmlt v2.2d, v2.2d, #0 -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: str d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp, #16] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: sbfx x8, x8, #0, #1 +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: sbfx x8, x9, #0, #1 +; NONEON-NOSVE-NEXT: fcsel d3, d2, d0, ne +; 
NONEON-NOSVE-NEXT: ldr d0, [sp, #32] +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: fcsel d0, d1, d0, ne +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %sel = select <2 x i1> %mask, <2 x double> %op1, <2 x double> %op2 ret <2 x double> %sel @@ -376,14 +572,29 @@ define void @select_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: select_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] -; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] -; NONEON-NOSVE-NEXT: fcmeq v4.2d, v0.2d, v1.2d -; NONEON-NOSVE-NEXT: fcmeq v5.2d, v2.2d, v3.2d -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v4.16b -; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b -; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-96]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: stp q2, q3, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d5, d1, [sp] +; NONEON-NOSVE-NEXT: ldp d0, d3, [sp, #24] +; NONEON-NOSVE-NEXT: ldp d4, d2, [sp, #40] +; NONEON-NOSVE-NEXT: fcmp d1, d0 +; NONEON-NOSVE-NEXT: fcsel d0, d1, d0, eq +; NONEON-NOSVE-NEXT: fcmp d3, d2 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #56] +; NONEON-NOSVE-NEXT: fcsel d2, d3, d2, eq +; NONEON-NOSVE-NEXT: fcmp d4, d1 +; NONEON-NOSVE-NEXT: ldr d3, [sp, #16] +; NONEON-NOSVE-NEXT: fcsel d1, d4, d1, eq +; NONEON-NOSVE-NEXT: fcmp d5, d3 +; NONEON-NOSVE-NEXT: fcsel d3, d5, d3, eq +; NONEON-NOSVE-NEXT: stp d2, d1, [sp, #80] +; NONEON-NOSVE-NEXT: stp d3, d0, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll index 
ae97a266c6ff0d..3ba61c3335a64c 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll @@ -25,10 +25,21 @@ define <4 x i8> @insertelement_v4i8(<4 x i8> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 -; NONEON-NOSVE-NEXT: mov v0.h[3], w8 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: str w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %r = insertelement <4 x i8> %op1, i8 5, i64 3 ret <4 x i8> %r @@ -50,10 +61,23 @@ define <8 x i8> @insertelement_v8i8(<8 x i8> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 -; NONEON-NOSVE-NEXT: mov v0.b[7], w8 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, 
#16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %r = insertelement <8 x i8> %op1, i8 5, i64 7 ret <8 x i8> %r @@ -75,8 +99,25 @@ define <16 x i8> @insertelement_v16i8(<16 x i8> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v16i8: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 -; NONEON-NOSVE-NEXT: mov v0.b[15], w8 +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %r = insertelement <16 x i8> %op1, i8 5, i64 15 ret <16 x i8> %r @@ -98,8 +139,25 @@ define <32 x i8> @insertelement_v32i8(<32 x i8> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v32i8: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 -; NONEON-NOSVE-NEXT: mov v1.b[15], w8 +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: str q1, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; 
NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %r = insertelement <32 x i8> %op1, i8 5, i64 31 ret <32 x i8> %r @@ -122,10 +180,18 @@ define <2 x i16> @insertelement_v2i16(<2 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 -; NONEON-NOSVE-NEXT: mov v0.s[1], w8 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w9, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %r = insertelement <2 x i16> %op1, i16 5, i64 1 ret <2 x i16> %r @@ -147,10 +213,21 @@ define <4 x i16> @insertelement_v4i16(<4 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 -; NONEON-NOSVE-NEXT: mov v0.h[3], w8 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: str w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: strh 
w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %r = insertelement <4 x i16> %op1, i16 5, i64 3 ret <4 x i16> %r @@ -172,8 +249,23 @@ define <8 x i16> @insertelement_v8i16(<8 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v8i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 -; NONEON-NOSVE-NEXT: mov v0.h[7], w8 +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %r = insertelement <8 x i16> %op1, i16 5, i64 7 ret <8 x i16> %r @@ -195,8 +287,23 @@ define <16 x i16> @insertelement_v16i16(<16 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v16i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 -; NONEON-NOSVE-NEXT: mov v1.h[7], w8 +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: str q1, [sp, #16] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; 
NONEON-NOSVE-NEXT: ret %r = insertelement <16 x i16> %op1, i16 5, i64 15 ret <16 x i16> %r @@ -219,10 +326,18 @@ define <2 x i32> @insertelement_v2i32(<2 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 -; NONEON-NOSVE-NEXT: mov v0.s[1], w8 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w9, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %r = insertelement <2 x i32> %op1, i32 5, i64 1 ret <2 x i32> %r @@ -244,8 +359,20 @@ define <4 x i32> @insertelement_v4i32(<4 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v4i32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 -; NONEON-NOSVE-NEXT: mov v0.s[3], w8 +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %r = insertelement <4 x i32> %op1, i32 5, i64 3 ret <4 x i32> %r @@ -267,9 +394,20 @@ define <8 x i32> @insertelement_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: insertelement_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: sub sp, 
sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 -; NONEON-NOSVE-NEXT: mov v1.s[3], w8 +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: str w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %r = insertelement <8 x i32> %op1, i32 5, i64 7 @@ -286,8 +424,12 @@ define <1 x i64> @insertelement_v1i64(<1 x i64> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v1i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 -; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %r = insertelement <1 x i64> %op1, i64 5, i64 0 ret <1 x i64> %r @@ -309,8 +451,18 @@ define <2 x i64> @insertelement_v2i64(<2 x i64> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v2i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 -; NONEON-NOSVE-NEXT: mov v0.d[1], x8 +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: str x8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x9, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: stp x9, x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %r = insertelement <2 x i64> %op1, i64 5, i64 1 ret <2 x i64> %r @@ -332,9 +484,18 @@ define <4 x i64> @insertelement_v4i64(ptr %a) { 
; ; NONEON-NOSVE-LABEL: insertelement_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 -; NONEON-NOSVE-NEXT: mov v1.d[1], x8 +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: str x8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp] +; NONEON-NOSVE-NEXT: ldr x9, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: stp x9, x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %r = insertelement <4 x i64> %op1, i64 5, i64 3 @@ -358,11 +519,14 @@ define <2 x half> @insertelement_v2f16(<2 x half> %op1) { ; NONEON-NOSVE-LABEL: insertelement_v2f16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: adrp x8, .LCPI14_0 -; NONEON-NOSVE-NEXT: add x8, x8, :lo12:.LCPI14_0 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: ld1r { v1.4h }, [x8] -; NONEON-NOSVE-NEXT: mov v1.h[0], v0.h[0] -; NONEON-NOSVE-NEXT: fmov d0, d1 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [x8, :lo12:.LCPI14_0] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: str h1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %r = insertelement <2 x half> %op1, half 5.0, i64 1 ret <2 x half> %r @@ -384,11 +548,22 @@ define <4 x half> @insertelement_v4f16(<4 x half> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; NONEON-NOSVE-NEXT: adrp x8, .LCPI15_0 -; NONEON-NOSVE-NEXT: add x8, x8, :lo12:.LCPI15_0 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[3], [x8] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: ldr h1, [x8, :lo12:.LCPI15_0] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: str h1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: str w8, [sp, #16] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: str h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %r = insertelement <4 x half> %op1, half 5.0, i64 3 ret <4 x half> %r @@ -410,9 +585,24 @@ define <8 x half> @insertelement_v8f16(<8 x half> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v8f16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: adrp x8, .LCPI16_0 -; NONEON-NOSVE-NEXT: add x8, x8, :lo12:.LCPI16_0 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[7], [x8] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldr h1, [x8, :lo12:.LCPI16_0] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: 
str h1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %r = insertelement <8 x half> %op1, half 5.0, i64 7 ret <8 x half> %r @@ -434,10 +624,24 @@ define <16 x half> @insertelement_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: insertelement_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: adrp x8, .LCPI17_0 -; NONEON-NOSVE-NEXT: add x8, x8, :lo12:.LCPI17_0 -; NONEON-NOSVE-NEXT: ld1 { v1.h }[7], [x8] +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: ldr h1, [x8, :lo12:.LCPI17_0] +; NONEON-NOSVE-NEXT: str h1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: str h1, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #16] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: str h1, [sp, #46] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %r = insertelement <16 x half> %op1, half 5.0, i64 15 @@ -461,10 +665,18 @@ define <2 x float> @insertelement_v2f32(<2 x float> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov s1, #5.00000000 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov v0.s[1], v1.s[0] -; NONEON-NOSVE-NEXT: // kill: def $d0 
killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: mov w8, #1084227584 // =0x40a00000 +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr s1, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: stp s1, s0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %r = insertelement <2 x float> %op1, float 5.0, i64 1 ret <2 x float> %r @@ -486,8 +698,20 @@ define <4 x float> @insertelement_v4f32(<4 x float> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov s1, #5.00000000 -; NONEON-NOSVE-NEXT: mov v0.s[3], v1.s[0] +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: mov w8, #1084227584 // =0x40a00000 +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: stp s1, s0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %r = insertelement <4 x float> %op1, float 5.0, i64 3 ret <4 x float> %r @@ -509,9 +733,21 @@ define <8 x float> @insertelement_v8f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: insertelement_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov s2, #5.00000000 -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: mov v1.s[3], v2.s[0] +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: mov w8, #1084227584 // =0x40a00000 +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: str w8, [sp, 
#48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp] +; NONEON-NOSVE-NEXT: ldr s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: stp s1, s0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %r = insertelement <8 x float> %op1, float 5.0, i64 7 @@ -527,8 +763,12 @@ define <1 x double> @insertelement_v1f64(<1 x double> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: mov x8, #4617315517961601024 // =0x4014000000000000 -; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %r = insertelement <1 x double> %op1, double 5.0, i64 0 ret <1 x double> %r @@ -550,8 +790,18 @@ define <2 x double> @insertelement_v2f64(<2 x double> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov d1, #5.00000000 -; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: mov x8, #4617315517961601024 // =0x4014000000000000 +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: str x8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d1, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %r = insertelement <2 x double> %op1, double 5.0, i64 1 ret <2 x double> %r @@ -573,10 +823,19 @@ define <4 x double> @insertelement_v4f64(ptr %a) { ; ; 
NONEON-NOSVE-LABEL: insertelement_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov d0, #5.00000000 +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: mov x8, #4617315517961601024 // =0x4014000000000000 ; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: mov v1.d[1], v0.d[0] +; NONEON-NOSVE-NEXT: str x8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp] +; NONEON-NOSVE-NEXT: ldr d1, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #32] ; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %r = insertelement <4 x double> %op1, double 5.0, i64 3 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll index 1b438559e05380..a2875ffef2e88a 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll @@ -20,7 +20,27 @@ define <4 x i8> @add_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: add_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: add v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; 
NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = add <4 x i8> %op1, %op2 ret <4 x i8> %res @@ -37,7 +57,43 @@ define <8 x i8> @add_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: add_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: add v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = add <8 x i8> %op1, %op2 ret <8 x 
i8> %res @@ -54,7 +110,74 @@ define <16 x i8> @add_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: add_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: add w8, 
w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = add <16 x i8> %op1, %op2 ret <16 x i8> %res @@ -72,11 +195,143 @@ define void @add_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: add_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: add v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: add v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: 
ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; 
NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; 
NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -96,7 +351,18 @@ define <2 x i16> @add_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: add_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: add v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = add <2 x i16> %op1, %op2 ret <2 x i16> %res @@ -113,7 +379,27 @@ define <4 x i16> @add_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: add_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: add v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; 
NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = add <4 x i16> %op1, %op2 ret <4 x i16> %res @@ -130,7 +416,42 @@ define <8 x i16> @add_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: add_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; 
NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = add <8 x i16> %op1, %op2 ret <8 x i16> %res @@ -148,11 +469,79 @@ define void @add_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: add_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: add v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: add v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, 
[sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -172,7 +561,18 @@ define <2 x i32> @add_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: add_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: add v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; 
NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = add <2 x i32> %op1, %op2 ret <2 x i32> %res @@ -189,7 +589,24 @@ define <4 x i32> @add_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: add_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: add v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: add w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = add <4 x i32> %op1, %op2 ret <4 x i32> %res @@ -207,11 +624,43 @@ define void @add_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: add_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: add v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: add v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, 
[sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: add w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: str w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: add w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: str w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: add w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -231,7 +680,14 @@ define <1 x i64> @add_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: add_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: add d0, d0, d1 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: add x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = add <1 x i64> %op1, %op2 ret <1 x i64> %res @@ -248,7 +704,17 @@ define <2 x i64> @add_v2i64(<2 x i64> %op1, <2 x 
i64> %op2) { ; ; NONEON-NOSVE-LABEL: add_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: add v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: add x8, x10, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: add x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = add <2 x i64> %op1, %op2 ret <2 x i64> %res @@ -266,11 +732,29 @@ define void @add_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: add_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: add v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: add v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: add x8, x10, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #88] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: add x8, x9, x8 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: str x8, [sp, #80] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: add x8, x10, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: add x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -303,7 +787,27 @@ define <4 x i8> @mul_v4i8(<4 x i8> %op1, 
<4 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: mul_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mul v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = mul <4 x i8> %op1, %op2 ret <4 x i8> %res @@ -329,7 +833,43 @@ define <8 x i8> @mul_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: mul_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mul v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; 
NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = mul <8 x i8> %op1, %op2 ret <8 x i8> %res @@ -355,7 +895,74 @@ define <16 x i8> @mul_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: mul_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mul v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] 
+; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = mul <16 x i8> %op1, %op2 ret <16 x i8> %res @@ -384,11 +991,143 @@ define void @mul_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: mul_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: mul v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: mul v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb 
w8, [sp, #59] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb 
w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, 
[sp, #18] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -417,7 +1156,17 @@ define <2 x i16> @mul_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: mul_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mul v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: mul w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = mul <2 x i16> %op1, %op2 ret <2 x i16> %res @@ -443,7 +1192,27 @@ define <4 x i16> @mul_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: mul_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mul v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, 
#28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = mul <4 x i16> %op1, %op2 ret <4 x i16> %res @@ -469,7 +1238,42 @@ define <8 x i16> @mul_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: mul_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mul v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; 
NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = mul <8 x i16> %op1, %op2 ret <8 x i16> %res @@ -498,11 +1302,79 @@ define void @mul_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: mul_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: mul v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: mul v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh 
w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -531,7 +1403,17 @@ define <2 x i32> @mul_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: mul_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mul v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: mul w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, 
w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = mul <2 x i32> %op1, %op2 ret <2 x i32> %res @@ -557,7 +1439,22 @@ define <4 x i32> @mul_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: mul_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mul v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: mul w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: mul w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = mul <4 x i32> %op1, %op2 ret <4 x i32> %res @@ -586,11 +1483,39 @@ define void @mul_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: mul_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: mul v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: mul v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: mul w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: mul w11, w10, w8 
+; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: mul w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: mul w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -619,12 +1544,14 @@ define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: mul_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fmov x8, d1 ; NONEON-NOSVE-NEXT: fmov x9, d0 ; NONEON-NOSVE-NEXT: mul x8, x9, x8 -; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = mul <1 x i64> %op1, %op2 ret <1 x i64> %res @@ -650,14 +1577,16 @@ define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: mul_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x10, d1 -; NONEON-NOSVE-NEXT: fmov x11, d0 -; NONEON-NOSVE-NEXT: mov x8, v1.d[1] -; NONEON-NOSVE-NEXT: mov x9, v0.d[1] -; NONEON-NOSVE-NEXT: mul x10, x11, x10 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: mul x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] ; NONEON-NOSVE-NEXT: mul x8, x9, x8 -; NONEON-NOSVE-NEXT: fmov d0, x10 -; NONEON-NOSVE-NEXT: mov v0.d[1], x8 +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = mul <2 x i64> %op1, %op2 ret <2 x i64> %res @@ -686,25 +1615,27 @@ define void @mul_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: mul_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] -; NONEON-NOSVE-NEXT: fmov x9, d0 -; NONEON-NOSVE-NEXT: fmov x12, d2 -; NONEON-NOSVE-NEXT: mov x11, v2.d[1] -; NONEON-NOSVE-NEXT: fmov x8, d1 -; NONEON-NOSVE-NEXT: mov x10, v3.d[1] -; NONEON-NOSVE-NEXT: mov x13, v1.d[1] -; NONEON-NOSVE-NEXT: mov x14, v0.d[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: mul x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] ; NONEON-NOSVE-NEXT: mul x8, x9, x8 -; NONEON-NOSVE-NEXT: fmov x9, d3 -; NONEON-NOSVE-NEXT: mul x10, x11, x10 -; NONEON-NOSVE-NEXT: mul x9, x12, x9 -; NONEON-NOSVE-NEXT: fmov d1, x8 -; NONEON-NOSVE-NEXT: mul x11, x14, x13 -; NONEON-NOSVE-NEXT: fmov d0, x9 -; NONEON-NOSVE-NEXT: mov v1.d[1], x11 -; NONEON-NOSVE-NEXT: mov v0.d[1], x10 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: mul x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: mul x8, x9, x8 +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, 
#64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -728,7 +1659,27 @@ define <4 x i8> @sub_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: sub_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sub <4 x i8> %op1, %op2 ret <4 x i8> %res @@ -745,7 +1696,43 @@ define <8 x i8> @sub_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: sub_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; 
NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sub <8 x i8> %op1, %op2 ret <8 x i8> %res @@ -762,7 +1749,74 @@ define <16 x i8> @sub_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: sub_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] 
+; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = sub <16 x i8> %op1, %op2 ret <16 x i8> %res @@ -780,11 +1834,143 @@ define void @sub_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: sub_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: sub v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: sub v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb 
w8, [sp, #59] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb 
w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, 
[sp, #18] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -804,7 +1990,18 @@ define <2 x i16> @sub_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: sub_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: sub w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sub <2 x i16> %op1, %op2 ret <2 x i16> %res @@ -821,7 +2018,27 @@ define <4 x i16> @sub_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: sub_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; 
NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sub <4 x i16> %op1, %op2 ret <4 x i16> %res @@ -838,7 +2055,42 @@ define <8 x i16> @sub_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: sub_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; 
NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = sub <8 x i16> %op1, %op2 ret <8 x i16> %res @@ -856,11 +2108,79 @@ define void @sub_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: sub_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: sub v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: sub v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh 
w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -880,7 +2200,18 @@ define <2 x i32> @sub_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: sub_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: sub w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr 
w8, [sp, #16] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sub <2 x i32> %op1, %op2 ret <2 x i32> %res @@ -897,7 +2228,24 @@ define <4 x i32> @sub_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: sub_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: sub w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: sub w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = sub <4 x i32> %op1, %op2 ret <4 x i32> %res @@ -915,11 +2263,43 @@ define void @sub_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: sub_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: sub v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: sub v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: sub w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: 
sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: str w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: sub w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: str w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: sub w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: sub w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -939,7 +2319,14 @@ define <1 x i64> @sub_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: sub_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub d0, d0, d1 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: sub x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = sub <1 x i64> %op1, %op2 ret <1 x i64> %res @@ -956,7 +2343,17 @@ define <2 x i64> @sub_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: sub_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: sub x8, x10, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: sub x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = sub <2 x i64> %op1, %op2 ret <2 x i64> %res @@ -974,11 +2371,29 @@ define void @sub_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: sub_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: sub v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: sub v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: sub x8, x10, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #88] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: sub x8, x9, x8 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: str x8, [sp, #80] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: sub x8, x10, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: sub x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -1003,9 +2418,26 @@ define <4 x i8> @abs_v4i8(<4 x i8> %op1) { ; ; NONEON-NOSVE-LABEL: abs_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #8 -; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #8 -; 
NONEON-NOSVE-NEXT: abs v0.4h, v0.4h +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #6] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #4] +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #2] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: cmp w9, #0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: cneg w8, w9, mi +; NONEON-NOSVE-NEXT: cmp w10, #0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: cneg w8, w10, mi +; NONEON-NOSVE-NEXT: cmp w11, #0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: cneg w8, w11, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i8> @llvm.abs.v4i8(<4 x i8> %op1, i1 false) ret <4 x i8> %res @@ -1022,7 +2454,42 @@ define <8 x i8> @abs_v8i8(<8 x i8> %op1) { ; ; NONEON-NOSVE-LABEL: abs_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: abs v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #7] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #5] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #3] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #11] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #1] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %op1, i1 false) ret <8 x i8> %res @@ -1039,7 +2506,74 @@ define <16 x i8> @abs_v16i8(<16 x i8> %op1) { ; ; NONEON-NOSVE-LABEL: abs_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: abs v0.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #9] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #7] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #5] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #3] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi 
+; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #1] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %op1, i1 false) ret <16 x i8> %res @@ -1057,10 +2591,140 @@ define void @abs_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: abs_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: abs v0.16b, v0.16b -; NONEON-NOSVE-NEXT: abs v1.16b, v1.16b +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, 
w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #9] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #7] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, 
w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #5] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #3] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #1] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %res = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %op1, i1 false) @@ -1080,9 +2744,17 @@ define <2 x i16> @abs_v2i16(<2 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: abs_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16 -; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16 -; NONEON-NOSVE-NEXT: abs v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #4] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: cmp w9, #0 +; NONEON-NOSVE-NEXT: cneg w9, w9, mi +; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %op1, i1 false) ret <2 x i16> %res @@ -1099,7 +2771,26 @@ define <4 x i16> @abs_v4i16(<4 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: abs_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: abs v0.4h, v0.4h +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %op1, i1 false) ret <4 x i16> %res @@ -1116,7 +2807,42 @@ define <8 x i16> @abs_v8i16(<8 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: abs_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: abs v0.8h, v0.8h +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %op1, i1 false) ret <8 x i16> %res @@ -1134,10 +2860,76 @@ define void @abs_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: abs_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: abs v0.8h, v0.8h -; NONEON-NOSVE-NEXT: abs v1.8h, v1.8h +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, 
w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %op1, i1 false) @@ -1156,7 +2948,17 @@ define <2 x i32> @abs_v2i32(<2 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: abs_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: abs v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w9, w8, mi +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %op1, i1 false) ret <2 x i32> %res @@ -1173,7 +2975,24 @@ define <4 x i32> @abs_v4i32(<4 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: abs_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: abs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w9, w8, mi +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w9, w8, mi +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %op1, i1 false) ret <4 x i32> %res @@ -1191,10 +3010,40 @@ define void @abs_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: abs_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: abs v0.4s, v0.4s -; NONEON-NOSVE-NEXT: abs v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w9, w8, mi +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w9, w8, mi +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w9, w8, mi +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w9, w8, mi +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %op1, i1 false) @@ -1213,7 +3062,14 @@ define <1 x i64> @abs_v1i64(<1 x i64> %op1) { ; ; NONEON-NOSVE-LABEL: abs_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: abs d0, d0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: cneg x8, x8, mi +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.abs.v1i64(<1 x i64> %op1, i1 false) ret <1 x i64> %res @@ -1230,7 +3086,17 @@ define <2 x i64> @abs_v2i64(<2 x i64> %op1) { ; ; 
NONEON-NOSVE-LABEL: abs_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: abs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: cneg x9, x8, mi +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: cneg x8, x8, mi +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %op1, i1 false) ret <2 x i64> %res @@ -1248,10 +3114,26 @@ define void @abs_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: abs_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: abs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: abs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: cneg x9, x8, mi +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: cneg x8, x8, mi +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: cneg x9, x8, mi +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: cneg x8, x8, mi +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %op1, i1 false) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll index ee0ca0e60b5e51..0b4316686fff64 100644 --- 
a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll @@ -22,7 +22,51 @@ define <8 x i8> @icmp_eq_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: icmp_eq_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmeq v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] 
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <8 x i8> %op1, %op2 %sext = sext <8 x i1> %cmp to <8 x i8> @@ -42,7 +86,90 @@ define <16 x i8> @icmp_eq_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: icmp_eq_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmeq v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: csetm w8, eq +; 
NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <16 x i8> %op1, %op2 %sext = sext <16 x i1> %cmp to <16 x i8> @@ -64,11 +191,175 @@ define void @icmp_eq_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: icmp_eq_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: cmeq v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: cmeq v1.16b, v2.16b, 
v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb 
w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 
+; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: 
strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -91,7 +382,31 @@ define <4 x i16> @icmp_eq_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: icmp_eq_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmeq v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <4 x i16> %op1, %op2 %sext = sext <4 x i1> %cmp to <4 x i16> @@ -111,7 +426,50 @@ define <8 x i16> @icmp_eq_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: icmp_eq_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: 
cmeq v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <8 x i16> %op1, %op2 %sext = sext <8 x i1> %cmp to <8 x i16> @@ -133,11 +491,95 @@ define void @icmp_eq_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: icmp_eq_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; 
NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: cmeq v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: cmeq v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #38] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #34] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; 
NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -160,7 +602,19 @@ define <2 x i32> @icmp_eq_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: icmp_eq_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmeq v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, 
w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: csetm w10, eq +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <2 x i32> %op1, %op2 %sext = sext <2 x i1> %cmp to <2 x i32> @@ -180,7 +634,26 @@ define <4 x i32> @icmp_eq_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: icmp_eq_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmeq v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: csetm w10, eq +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: csetm w10, eq +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <4 x i32> %op1, %op2 %sext = sext <4 x i1> %cmp to <4 x i32> @@ -202,11 +675,47 @@ define void @icmp_eq_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: icmp_eq_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: cmeq v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: cmeq v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] 
+; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: csetm w10, eq +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: csetm w10, eq +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: csetm w10, eq +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: csetm w10, eq +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -229,7 +738,15 @@ define <1 x i64> @icmp_eq_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: icmp_eq_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmeq d0, d0, d1 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csetm x8, eq +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, 
#8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <1 x i64> %op1, %op2 %sext = sext <1 x i1> %cmp to <1 x i64> @@ -249,7 +766,18 @@ define <2 x i64> @icmp_eq_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: icmp_eq_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmeq v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: csetm x10, eq +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csetm x8, eq +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <2 x i64> %op1, %op2 %sext = sext <2 x i1> %cmp to <2 x i64> @@ -271,11 +799,31 @@ define void @icmp_eq_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: icmp_eq_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: cmeq v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: cmeq v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: csetm x10, eq +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csetm x8, eq +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #80] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: csetm x10, eq +; NONEON-NOSVE-NEXT: cmp x9, x8 +; 
NONEON-NOSVE-NEXT: csetm x8, eq +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -304,13 +852,175 @@ define void @icmp_ne_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: icmp_ne_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: cmeq v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: cmeq v1.16b, v2.16b, v3.16b -; NONEON-NOSVE-NEXT: mvn v0.16b, v0.16b -; NONEON-NOSVE-NEXT: mvn v1.16b, v1.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; 
NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: csetm w8, ne +; 
NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; 
NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -337,10 +1047,53 @@ define void @icmp_sge_v8i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: icmp_sge_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: cmge v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #4] +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %op2 = load <8 x i16>, ptr %b @@ -369,11 +1122,95 @@ define void @icmp_sgt_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: icmp_sgt_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; 
NONEON-NOSVE-NEXT: cmgt v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: cmgt v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #46] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #44] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #42] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #58] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #40] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #38] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #54] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #36] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #34] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #50] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #32] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; 
NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #4] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -400,10 +1237,29 @@ define void @icmp_sle_v4i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: icmp_sle_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: cmge v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: csetm w10, le +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: csetm w10, le +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i32>, ptr %a %op2 = load <4 x i32>, ptr %b @@ -432,11 +1288,47 @@ define void @icmp_slt_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: icmp_slt_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: cmgt v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: cmgt v1.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: csetm w10, lt +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: csetm w10, lt +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: stp 
w8, w10, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: csetm w10, lt +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: csetm w10, lt +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -463,10 +1355,21 @@ define void @icmp_uge_v2i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: icmp_uge_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: cmhs v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: csetm x10, hs +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csetm x8, hs +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i64>, ptr %a %op2 = load <2 x i64>, ptr %b @@ -493,10 +1396,21 @@ define void @icmp_ugt_v2i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: icmp_ugt_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: cmhi v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: csetm x10, hi +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csetm x8, hi +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i64>, ptr %a %op2 = load <2 x i64>, ptr %b @@ -523,10 +1437,21 @@ define void @icmp_ule_v2i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: icmp_ule_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: cmhs v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: csetm x10, ls +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csetm x8, ls +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i64>, ptr %a %op2 = load <2 x i64>, ptr %b @@ -553,10 +1478,21 @@ define void @icmp_ult_v2i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: icmp_ult_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: cmhi v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: csetm x10, lo +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csetm x8, lo +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i64>, ptr %a %op2 = load <2 x i64>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll index d79d6c18ed5a6e..e09b1613a54afb 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll @@ -28,27 +28,27 @@ define <4 x i8> @sdiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: sdiv_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #8 -; 
NONEON-NOSVE-NEXT: shl v1.4h, v1.4h, #8 -; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #8 -; NONEON-NOSVE-NEXT: sshr v1.4h, v1.4h, #8 -; NONEON-NOSVE-NEXT: smov w8, v1.h[1] -; NONEON-NOSVE-NEXT: smov w9, v0.h[1] -; NONEON-NOSVE-NEXT: smov w10, v0.h[0] -; NONEON-NOSVE-NEXT: smov w11, v0.h[2] -; NONEON-NOSVE-NEXT: smov w12, v0.h[3] -; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 -; NONEON-NOSVE-NEXT: smov w9, v1.h[0] +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #12] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #10] +; NONEON-NOSVE-NEXT: ldrsb w12, [sp, #8] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #20] ; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 -; NONEON-NOSVE-NEXT: smov w10, v1.h[2] +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #18] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] ; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 -; NONEON-NOSVE-NEXT: smov w11, v1.h[3] -; NONEON-NOSVE-NEXT: fmov s0, w9 -; NONEON-NOSVE-NEXT: mov v0.h[1], w8 -; NONEON-NOSVE-NEXT: sdiv w8, w12, w11 -; NONEON-NOSVE-NEXT: mov v0.h[2], w10 -; NONEON-NOSVE-NEXT: mov v0.h[3], w8 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #16] +; NONEON-NOSVE-NEXT: strh w9, [sp, #28] +; NONEON-NOSVE-NEXT: sdiv w11, w12, w11 +; NONEON-NOSVE-NEXT: strh w10, [sp, #26] +; NONEON-NOSVE-NEXT: strh w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sdiv <4 x i8> %op1, %op2 ret <4 x i8> %res @@ -80,41 +80,43 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: sdiv_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: smov w8, v1.b[1] -; NONEON-NOSVE-NEXT: smov 
w9, v0.b[1] -; NONEON-NOSVE-NEXT: smov w10, v0.b[0] -; NONEON-NOSVE-NEXT: smov w11, v0.b[2] -; NONEON-NOSVE-NEXT: smov w12, v0.b[3] -; NONEON-NOSVE-NEXT: smov w13, v0.b[4] -; NONEON-NOSVE-NEXT: smov w14, v0.b[5] -; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 -; NONEON-NOSVE-NEXT: smov w9, v1.b[0] -; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 -; NONEON-NOSVE-NEXT: smov w10, v1.b[2] -; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 -; NONEON-NOSVE-NEXT: smov w11, v1.b[3] -; NONEON-NOSVE-NEXT: fmov s2, w9 -; NONEON-NOSVE-NEXT: smov w9, v1.b[6] -; NONEON-NOSVE-NEXT: mov v2.b[1], w8 -; NONEON-NOSVE-NEXT: sdiv w11, w12, w11 -; NONEON-NOSVE-NEXT: smov w12, v1.b[4] -; NONEON-NOSVE-NEXT: mov v2.b[2], w10 -; NONEON-NOSVE-NEXT: smov w10, v0.b[6] -; NONEON-NOSVE-NEXT: sdiv w12, w13, w12 -; NONEON-NOSVE-NEXT: smov w13, v1.b[5] -; NONEON-NOSVE-NEXT: mov v2.b[3], w11 -; NONEON-NOSVE-NEXT: smov w11, v0.b[7] -; NONEON-NOSVE-NEXT: sdiv w8, w14, w13 -; NONEON-NOSVE-NEXT: mov v2.b[4], w12 -; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 -; NONEON-NOSVE-NEXT: smov w10, v1.b[7] -; NONEON-NOSVE-NEXT: mov v2.b[5], w8 -; NONEON-NOSVE-NEXT: sdiv w8, w11, w10 -; NONEON-NOSVE-NEXT: mov v2.b[6], w9 -; NONEON-NOSVE-NEXT: mov v2.b[7], w8 -; NONEON-NOSVE-NEXT: fmov d0, d2 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, 
#11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sdiv <8 x i8> %op1, %op2 ret <8 x i8> %res @@ -166,71 +168,74 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: sdiv_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smov w8, v1.b[1] -; NONEON-NOSVE-NEXT: smov w9, v0.b[1] -; NONEON-NOSVE-NEXT: smov w10, v0.b[0] -; NONEON-NOSVE-NEXT: smov w11, v0.b[2] -; NONEON-NOSVE-NEXT: smov w12, v0.b[3] -; NONEON-NOSVE-NEXT: smov w13, v0.b[4] -; NONEON-NOSVE-NEXT: smov w14, v0.b[5] -; NONEON-NOSVE-NEXT: smov w15, v0.b[6] -; NONEON-NOSVE-NEXT: smov w16, v0.b[7] -; NONEON-NOSVE-NEXT: smov w17, v0.b[8] -; NONEON-NOSVE-NEXT: smov w18, v0.b[9] -; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 -; NONEON-NOSVE-NEXT: smov w9, v1.b[0] -; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 -; NONEON-NOSVE-NEXT: smov w10, v1.b[2] -; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 -; NONEON-NOSVE-NEXT: smov w11, v1.b[3] -; NONEON-NOSVE-NEXT: fmov s2, w9 -; NONEON-NOSVE-NEXT: smov w9, v1.b[10] -; NONEON-NOSVE-NEXT: mov v2.b[1], w8 -; NONEON-NOSVE-NEXT: sdiv w11, w12, w11 -; NONEON-NOSVE-NEXT: smov w12, v1.b[4] -; NONEON-NOSVE-NEXT: mov v2.b[2], w10 -; NONEON-NOSVE-NEXT: smov w10, v0.b[10] -; NONEON-NOSVE-NEXT: sdiv w12, w13, w12 -; NONEON-NOSVE-NEXT: smov w13, v1.b[5] -; 
NONEON-NOSVE-NEXT: mov v2.b[3], w11 -; NONEON-NOSVE-NEXT: smov w11, v0.b[11] -; NONEON-NOSVE-NEXT: sdiv w13, w14, w13 -; NONEON-NOSVE-NEXT: smov w14, v1.b[6] -; NONEON-NOSVE-NEXT: mov v2.b[4], w12 -; NONEON-NOSVE-NEXT: smov w12, v0.b[12] -; NONEON-NOSVE-NEXT: sdiv w14, w15, w14 -; NONEON-NOSVE-NEXT: smov w15, v1.b[7] -; NONEON-NOSVE-NEXT: mov v2.b[5], w13 -; NONEON-NOSVE-NEXT: smov w13, v0.b[13] -; NONEON-NOSVE-NEXT: sdiv w15, w16, w15 -; NONEON-NOSVE-NEXT: smov w16, v1.b[8] -; NONEON-NOSVE-NEXT: mov v2.b[6], w14 -; NONEON-NOSVE-NEXT: sdiv w16, w17, w16 -; NONEON-NOSVE-NEXT: smov w17, v1.b[9] -; NONEON-NOSVE-NEXT: mov v2.b[7], w15 -; NONEON-NOSVE-NEXT: sdiv w8, w18, w17 -; NONEON-NOSVE-NEXT: mov v2.b[8], w16 -; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 -; NONEON-NOSVE-NEXT: smov w10, v1.b[11] -; NONEON-NOSVE-NEXT: mov v2.b[9], w8 -; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 -; NONEON-NOSVE-NEXT: smov w11, v1.b[12] -; NONEON-NOSVE-NEXT: mov v2.b[10], w9 -; NONEON-NOSVE-NEXT: smov w9, v1.b[14] -; NONEON-NOSVE-NEXT: sdiv w11, w12, w11 -; NONEON-NOSVE-NEXT: smov w12, v1.b[13] -; NONEON-NOSVE-NEXT: mov v2.b[11], w10 -; NONEON-NOSVE-NEXT: smov w10, v1.b[15] -; NONEON-NOSVE-NEXT: sdiv w8, w13, w12 -; NONEON-NOSVE-NEXT: smov w12, v0.b[14] -; NONEON-NOSVE-NEXT: mov v2.b[12], w11 -; NONEON-NOSVE-NEXT: smov w11, v0.b[15] -; NONEON-NOSVE-NEXT: sdiv w9, w12, w9 -; NONEON-NOSVE-NEXT: mov v2.b[13], w8 -; NONEON-NOSVE-NEXT: sdiv w8, w11, w10 -; NONEON-NOSVE-NEXT: mov v2.b[14], w9 -; NONEON-NOSVE-NEXT: mov v2.b[15], w8 -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #3] +; 
NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = sdiv <16 x i8> %op1, %op2 ret <16 x i8> %res @@ -315,159 +320,143 @@ define void @sdiv_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: sdiv_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str x27, [sp, #-80]! // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #16] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #32] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 -; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 -; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 -; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 -; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 -; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 -; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 -; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 -; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 -; NONEON-NOSVE-NEXT: .cfi_offset w27, -80 -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x0] -; NONEON-NOSVE-NEXT: ldr q3, [x1] -; NONEON-NOSVE-NEXT: smov w8, v1.b[1] -; NONEON-NOSVE-NEXT: smov w9, v0.b[1] -; NONEON-NOSVE-NEXT: smov w10, v0.b[0] -; NONEON-NOSVE-NEXT: smov 
w11, v0.b[2] -; NONEON-NOSVE-NEXT: smov w12, v0.b[3] -; NONEON-NOSVE-NEXT: smov w13, v0.b[4] -; NONEON-NOSVE-NEXT: smov w14, v0.b[5] -; NONEON-NOSVE-NEXT: smov w15, v0.b[6] -; NONEON-NOSVE-NEXT: smov w17, v0.b[8] -; NONEON-NOSVE-NEXT: smov w2, v0.b[10] -; NONEON-NOSVE-NEXT: smov w3, v0.b[11] -; NONEON-NOSVE-NEXT: smov w4, v0.b[12] -; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 -; NONEON-NOSVE-NEXT: smov w9, v1.b[0] -; NONEON-NOSVE-NEXT: smov w5, v0.b[13] -; NONEON-NOSVE-NEXT: smov w6, v0.b[14] -; NONEON-NOSVE-NEXT: smov w1, v3.b[1] -; NONEON-NOSVE-NEXT: smov w7, v2.b[0] -; NONEON-NOSVE-NEXT: smov w19, v2.b[2] -; NONEON-NOSVE-NEXT: smov w20, v2.b[3] -; NONEON-NOSVE-NEXT: smov w21, v2.b[4] -; NONEON-NOSVE-NEXT: smov w22, v2.b[5] -; NONEON-NOSVE-NEXT: smov w23, v2.b[6] -; NONEON-NOSVE-NEXT: smov w24, v2.b[7] -; NONEON-NOSVE-NEXT: smov w25, v2.b[8] -; NONEON-NOSVE-NEXT: smov w26, v2.b[9] -; NONEON-NOSVE-NEXT: smov w27, v2.b[10] -; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 -; NONEON-NOSVE-NEXT: smov w10, v1.b[2] -; NONEON-NOSVE-NEXT: sdiv w11, w11, w10 -; NONEON-NOSVE-NEXT: smov w10, v1.b[3] -; NONEON-NOSVE-NEXT: fmov s5, w9 -; NONEON-NOSVE-NEXT: smov w9, v3.b[11] -; NONEON-NOSVE-NEXT: mov v5.b[1], w8 -; NONEON-NOSVE-NEXT: sdiv w10, w12, w10 -; NONEON-NOSVE-NEXT: smov w12, v1.b[4] -; NONEON-NOSVE-NEXT: mov v5.b[2], w11 -; NONEON-NOSVE-NEXT: smov w11, v2.b[11] -; NONEON-NOSVE-NEXT: sdiv w13, w13, w12 -; NONEON-NOSVE-NEXT: smov w12, v1.b[5] -; NONEON-NOSVE-NEXT: mov v5.b[3], w10 -; NONEON-NOSVE-NEXT: smov w10, v3.b[12] -; NONEON-NOSVE-NEXT: sdiv w12, w14, w12 -; NONEON-NOSVE-NEXT: smov w14, v1.b[6] -; NONEON-NOSVE-NEXT: mov v5.b[4], w13 -; NONEON-NOSVE-NEXT: smov w13, v2.b[14] -; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 -; NONEON-NOSVE-NEXT: smov w14, v1.b[7] -; NONEON-NOSVE-NEXT: smov w15, v0.b[7] -; NONEON-NOSVE-NEXT: mov v5.b[5], w12 -; NONEON-NOSVE-NEXT: smov w12, v2.b[13] -; NONEON-NOSVE-NEXT: sdiv w14, w15, w14 -; NONEON-NOSVE-NEXT: smov w15, v1.b[8] -; NONEON-NOSVE-NEXT: mov 
v5.b[6], w16 -; NONEON-NOSVE-NEXT: sdiv w18, w17, w15 -; NONEON-NOSVE-NEXT: smov w15, v1.b[9] -; NONEON-NOSVE-NEXT: smov w17, v0.b[9] -; NONEON-NOSVE-NEXT: mov v5.b[7], w14 -; NONEON-NOSVE-NEXT: sdiv w17, w17, w15 -; NONEON-NOSVE-NEXT: smov w15, v1.b[10] -; NONEON-NOSVE-NEXT: mov v5.b[8], w18 -; NONEON-NOSVE-NEXT: sdiv w15, w2, w15 -; NONEON-NOSVE-NEXT: smov w2, v1.b[11] -; NONEON-NOSVE-NEXT: mov v5.b[9], w17 -; NONEON-NOSVE-NEXT: sdiv w2, w3, w2 -; NONEON-NOSVE-NEXT: smov w3, v1.b[12] -; NONEON-NOSVE-NEXT: mov v5.b[10], w15 -; NONEON-NOSVE-NEXT: sdiv w3, w4, w3 -; NONEON-NOSVE-NEXT: smov w4, v1.b[13] -; NONEON-NOSVE-NEXT: mov v5.b[11], w2 -; NONEON-NOSVE-NEXT: sdiv w4, w5, w4 -; NONEON-NOSVE-NEXT: smov w5, v1.b[14] -; NONEON-NOSVE-NEXT: mov v5.b[12], w3 -; NONEON-NOSVE-NEXT: sdiv w5, w6, w5 -; NONEON-NOSVE-NEXT: smov w6, v2.b[1] -; NONEON-NOSVE-NEXT: mov v5.b[13], w4 -; NONEON-NOSVE-NEXT: sdiv w1, w6, w1 -; NONEON-NOSVE-NEXT: smov w6, v3.b[0] -; NONEON-NOSVE-NEXT: mov v5.b[14], w5 -; NONEON-NOSVE-NEXT: sdiv w6, w7, w6 -; NONEON-NOSVE-NEXT: smov w7, v3.b[2] -; NONEON-NOSVE-NEXT: sdiv w7, w19, w7 -; NONEON-NOSVE-NEXT: smov w19, v3.b[3] -; NONEON-NOSVE-NEXT: fmov s4, w6 -; NONEON-NOSVE-NEXT: mov v4.b[1], w1 -; NONEON-NOSVE-NEXT: sdiv w19, w20, w19 -; NONEON-NOSVE-NEXT: smov w20, v3.b[4] -; NONEON-NOSVE-NEXT: mov v4.b[2], w7 -; NONEON-NOSVE-NEXT: sdiv w20, w21, w20 -; NONEON-NOSVE-NEXT: smov w21, v3.b[5] -; NONEON-NOSVE-NEXT: mov v4.b[3], w19 -; NONEON-NOSVE-NEXT: sdiv w21, w22, w21 -; NONEON-NOSVE-NEXT: smov w22, v3.b[6] -; NONEON-NOSVE-NEXT: mov v4.b[4], w20 -; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: sdiv w22, w23, w22 -; NONEON-NOSVE-NEXT: smov w23, v3.b[7] -; NONEON-NOSVE-NEXT: mov v4.b[5], w21 -; NONEON-NOSVE-NEXT: sdiv w23, w24, w23 -; NONEON-NOSVE-NEXT: smov w24, v3.b[8] -; NONEON-NOSVE-NEXT: mov v4.b[6], w22 -; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: sdiv w24, 
w25, w24 -; NONEON-NOSVE-NEXT: smov w25, v3.b[9] -; NONEON-NOSVE-NEXT: mov v4.b[7], w23 -; NONEON-NOSVE-NEXT: sdiv w25, w26, w25 -; NONEON-NOSVE-NEXT: smov w26, v3.b[10] -; NONEON-NOSVE-NEXT: mov v4.b[8], w24 -; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #32] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: sdiv w8, w27, w26 -; NONEON-NOSVE-NEXT: mov v4.b[9], w25 -; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #16] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: sdiv w9, w11, w9 -; NONEON-NOSVE-NEXT: smov w11, v2.b[12] -; NONEON-NOSVE-NEXT: mov v4.b[10], w8 -; NONEON-NOSVE-NEXT: smov w8, v3.b[15] -; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 -; NONEON-NOSVE-NEXT: smov w11, v3.b[13] -; NONEON-NOSVE-NEXT: mov v4.b[11], w9 -; NONEON-NOSVE-NEXT: smov w9, v1.b[15] -; NONEON-NOSVE-NEXT: sdiv w11, w12, w11 -; NONEON-NOSVE-NEXT: smov w12, v3.b[14] -; NONEON-NOSVE-NEXT: mov v4.b[12], w10 -; NONEON-NOSVE-NEXT: smov w10, v0.b[15] -; NONEON-NOSVE-NEXT: sdiv w12, w13, w12 -; NONEON-NOSVE-NEXT: smov w13, v2.b[15] -; NONEON-NOSVE-NEXT: mov v4.b[13], w11 -; NONEON-NOSVE-NEXT: sdiv w8, w13, w8 -; NONEON-NOSVE-NEXT: mov v4.b[14], w12 -; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 -; NONEON-NOSVE-NEXT: mov v4.b[15], w8 -; NONEON-NOSVE-NEXT: mov v5.b[15], w9 -; NONEON-NOSVE-NEXT: stp q4, q5, [x0] -; NONEON-NOSVE-NEXT: ldr x27, [sp], #80 // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #47] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #62] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #61] +; 
NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #60] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #59] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #58] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #57] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #56] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #55] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #54] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #53] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #52] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #51] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #50] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #49] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; 
NONEON-NOSVE-NEXT: ldrsb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #48] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #4] +; 
NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -490,19 +479,18 @@ define <2 x i16> @sdiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: sdiv_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16 -; NONEON-NOSVE-NEXT: shl v1.2s, v1.2s, #16 -; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16 -; NONEON-NOSVE-NEXT: sshr v1.2s, v1.2s, #16 -; NONEON-NOSVE-NEXT: fmov w8, s1 -; NONEON-NOSVE-NEXT: fmov w9, s0 -; NONEON-NOSVE-NEXT: mov w10, v0.s[1] -; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 -; NONEON-NOSVE-NEXT: mov w9, v1.s[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #8] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #16] ; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 -; NONEON-NOSVE-NEXT: fmov s0, w8 -; NONEON-NOSVE-NEXT: mov v0.s[1], w9 -; NONEON-NOSVE-NEXT: // kill: def $d0 
killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sdiv <2 x i16> %op1, %op2 ret <2 x i16> %res @@ -523,25 +511,27 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: sdiv_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: smov w8, v1.h[1] -; NONEON-NOSVE-NEXT: smov w9, v0.h[1] -; NONEON-NOSVE-NEXT: smov w10, v0.h[0] -; NONEON-NOSVE-NEXT: smov w11, v0.h[2] -; NONEON-NOSVE-NEXT: smov w12, v0.h[3] -; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 -; NONEON-NOSVE-NEXT: smov w9, v1.h[0] -; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 -; NONEON-NOSVE-NEXT: smov w10, v1.h[2] -; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 -; NONEON-NOSVE-NEXT: smov w11, v1.h[3] -; NONEON-NOSVE-NEXT: fmov s0, w9 -; NONEON-NOSVE-NEXT: mov v0.h[1], w8 -; NONEON-NOSVE-NEXT: sdiv w8, w12, w11 -; NONEON-NOSVE-NEXT: mov v0.h[2], w10 -; NONEON-NOSVE-NEXT: mov v0.h[3], w8 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; 
NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sdiv <4 x i16> %op1, %op2 ret <4 x i16> %res @@ -572,39 +562,42 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: sdiv_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smov w8, v1.h[1] -; NONEON-NOSVE-NEXT: smov w9, v0.h[1] -; NONEON-NOSVE-NEXT: smov w10, v0.h[0] -; NONEON-NOSVE-NEXT: smov w11, v0.h[2] -; NONEON-NOSVE-NEXT: smov w12, v0.h[3] -; NONEON-NOSVE-NEXT: smov w13, v0.h[4] -; NONEON-NOSVE-NEXT: smov w14, v0.h[5] -; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 -; NONEON-NOSVE-NEXT: smov w9, v1.h[0] -; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 -; NONEON-NOSVE-NEXT: smov w10, v1.h[2] -; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 -; NONEON-NOSVE-NEXT: smov w11, v1.h[3] -; NONEON-NOSVE-NEXT: fmov s2, w9 -; NONEON-NOSVE-NEXT: smov w9, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[1], w8 -; NONEON-NOSVE-NEXT: sdiv w11, w12, w11 -; NONEON-NOSVE-NEXT: smov w12, v1.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[2], w10 -; NONEON-NOSVE-NEXT: smov w10, v0.h[6] -; NONEON-NOSVE-NEXT: sdiv w12, w13, w12 -; NONEON-NOSVE-NEXT: smov w13, v1.h[5] -; NONEON-NOSVE-NEXT: mov v2.h[3], w11 -; NONEON-NOSVE-NEXT: smov w11, v0.h[7] -; NONEON-NOSVE-NEXT: sdiv w8, w14, w13 -; NONEON-NOSVE-NEXT: mov v2.h[4], w12 -; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 -; NONEON-NOSVE-NEXT: smov w10, v1.h[7] -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: sdiv w8, w11, w10 -; NONEON-NOSVE-NEXT: mov v2.h[6], w9 -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = sdiv <8 x i16> %op1, %op2 ret <8 x i16> %res @@ -649,75 +642,79 @@ define void @sdiv_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: sdiv_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x0] -; NONEON-NOSVE-NEXT: ldr q3, [x1] -; NONEON-NOSVE-NEXT: smov w8, v1.h[1] -; NONEON-NOSVE-NEXT: smov w9, v0.h[1] -; NONEON-NOSVE-NEXT: smov w10, v0.h[0] -; NONEON-NOSVE-NEXT: smov w11, v0.h[2] -; NONEON-NOSVE-NEXT: smov w12, v0.h[3] -; NONEON-NOSVE-NEXT: smov 
w13, v0.h[4] -; NONEON-NOSVE-NEXT: smov w14, v0.h[5] -; NONEON-NOSVE-NEXT: smov w15, v0.h[6] -; NONEON-NOSVE-NEXT: smov w16, v2.h[1] -; NONEON-NOSVE-NEXT: smov w17, v2.h[0] -; NONEON-NOSVE-NEXT: smov w18, v2.h[2] -; NONEON-NOSVE-NEXT: smov w1, v2.h[3] -; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 -; NONEON-NOSVE-NEXT: smov w9, v1.h[0] -; NONEON-NOSVE-NEXT: smov w2, v2.h[4] -; NONEON-NOSVE-NEXT: smov w3, v2.h[5] -; NONEON-NOSVE-NEXT: smov w4, v2.h[6] -; NONEON-NOSVE-NEXT: sdiv w10, w10, w9 -; NONEON-NOSVE-NEXT: smov w9, v1.h[2] -; NONEON-NOSVE-NEXT: sdiv w9, w11, w9 -; NONEON-NOSVE-NEXT: smov w11, v1.h[3] -; NONEON-NOSVE-NEXT: fmov s5, w10 -; NONEON-NOSVE-NEXT: smov w10, v3.h[7] -; NONEON-NOSVE-NEXT: mov v5.h[1], w8 -; NONEON-NOSVE-NEXT: sdiv w11, w12, w11 -; NONEON-NOSVE-NEXT: smov w12, v1.h[4] -; NONEON-NOSVE-NEXT: mov v5.h[2], w9 -; NONEON-NOSVE-NEXT: smov w9, v2.h[7] -; NONEON-NOSVE-NEXT: sdiv w12, w13, w12 -; NONEON-NOSVE-NEXT: smov w13, v1.h[5] -; NONEON-NOSVE-NEXT: mov v5.h[3], w11 -; NONEON-NOSVE-NEXT: smov w11, v0.h[7] -; NONEON-NOSVE-NEXT: sdiv w13, w14, w13 -; NONEON-NOSVE-NEXT: smov w14, v1.h[6] -; NONEON-NOSVE-NEXT: mov v5.h[4], w12 -; NONEON-NOSVE-NEXT: sdiv w14, w15, w14 -; NONEON-NOSVE-NEXT: smov w15, v3.h[1] -; NONEON-NOSVE-NEXT: mov v5.h[5], w13 -; NONEON-NOSVE-NEXT: sdiv w15, w16, w15 -; NONEON-NOSVE-NEXT: smov w16, v3.h[0] -; NONEON-NOSVE-NEXT: mov v5.h[6], w14 -; NONEON-NOSVE-NEXT: sdiv w16, w17, w16 -; NONEON-NOSVE-NEXT: smov w17, v3.h[2] -; NONEON-NOSVE-NEXT: sdiv w17, w18, w17 -; NONEON-NOSVE-NEXT: smov w18, v3.h[3] -; NONEON-NOSVE-NEXT: fmov s4, w16 -; NONEON-NOSVE-NEXT: mov v4.h[1], w15 -; NONEON-NOSVE-NEXT: sdiv w18, w1, w18 -; NONEON-NOSVE-NEXT: smov w1, v3.h[4] -; NONEON-NOSVE-NEXT: mov v4.h[2], w17 -; NONEON-NOSVE-NEXT: sdiv w1, w2, w1 -; NONEON-NOSVE-NEXT: smov w2, v3.h[5] -; NONEON-NOSVE-NEXT: mov v4.h[3], w18 -; NONEON-NOSVE-NEXT: sdiv w2, w3, w2 -; NONEON-NOSVE-NEXT: smov w3, v3.h[6] -; NONEON-NOSVE-NEXT: mov v4.h[4], w1 -; 
NONEON-NOSVE-NEXT: sdiv w8, w4, w3 -; NONEON-NOSVE-NEXT: mov v4.h[5], w2 -; NONEON-NOSVE-NEXT: sdiv w9, w9, w10 -; NONEON-NOSVE-NEXT: smov w10, v1.h[7] -; NONEON-NOSVE-NEXT: mov v4.h[6], w8 -; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 -; NONEON-NOSVE-NEXT: mov v4.h[7], w9 -; NONEON-NOSVE-NEXT: mov v5.h[7], w10 -; NONEON-NOSVE-NEXT: stp q4, q5, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #46] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #60] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #58] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #56] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #54] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #52] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #50] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #48] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; 
NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -738,17 +735,17 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: sdiv_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: fmov w8, s1 -; NONEON-NOSVE-NEXT: fmov w9, s0 -; NONEON-NOSVE-NEXT: mov w10, v0.s[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; 
NONEON-NOSVE-NEXT: sdiv w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] ; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 -; NONEON-NOSVE-NEXT: mov w9, v1.s[1] -; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 -; NONEON-NOSVE-NEXT: fmov s0, w8 -; NONEON-NOSVE-NEXT: mov v0.s[1], w9 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sdiv <2 x i32> %op1, %op2 ret <2 x i32> %res @@ -766,22 +763,22 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: sdiv_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, v1.s[1] -; NONEON-NOSVE-NEXT: mov w9, v0.s[1] -; NONEON-NOSVE-NEXT: fmov w10, s0 -; NONEON-NOSVE-NEXT: mov w11, v0.s[2] -; NONEON-NOSVE-NEXT: mov w12, v0.s[3] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: sdiv w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] ; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 -; NONEON-NOSVE-NEXT: fmov w9, s1 -; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 -; NONEON-NOSVE-NEXT: mov w10, v1.s[2] -; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 -; NONEON-NOSVE-NEXT: mov w11, v1.s[3] -; NONEON-NOSVE-NEXT: fmov s0, w9 -; NONEON-NOSVE-NEXT: mov v0.s[1], w8 -; NONEON-NOSVE-NEXT: sdiv w8, w12, w11 -; NONEON-NOSVE-NEXT: mov v0.s[2], w10 -; NONEON-NOSVE-NEXT: mov v0.s[3], w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = sdiv <4 x i32> %op1, %op2 ret <4 x i32> %res @@ -801,41 +798,39 @@ 
define void @sdiv_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: sdiv_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] -; NONEON-NOSVE-NEXT: mov w9, v0.s[1] -; NONEON-NOSVE-NEXT: fmov w10, s0 -; NONEON-NOSVE-NEXT: mov w11, v0.s[2] -; NONEON-NOSVE-NEXT: mov w8, v1.s[1] -; NONEON-NOSVE-NEXT: mov w12, v2.s[1] -; NONEON-NOSVE-NEXT: fmov w13, s2 -; NONEON-NOSVE-NEXT: mov w14, v2.s[2] -; NONEON-NOSVE-NEXT: mov w15, v2.s[3] -; NONEON-NOSVE-NEXT: mov w16, v0.s[3] -; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 -; NONEON-NOSVE-NEXT: fmov w9, s1 -; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 -; NONEON-NOSVE-NEXT: mov w10, v1.s[2] -; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 -; NONEON-NOSVE-NEXT: mov w11, v3.s[1] -; NONEON-NOSVE-NEXT: sdiv w11, w12, w11 -; NONEON-NOSVE-NEXT: fmov w12, s3 -; NONEON-NOSVE-NEXT: sdiv w12, w13, w12 -; NONEON-NOSVE-NEXT: mov w13, v3.s[2] -; NONEON-NOSVE-NEXT: sdiv w13, w14, w13 -; NONEON-NOSVE-NEXT: mov w14, v3.s[3] -; NONEON-NOSVE-NEXT: fmov s0, w12 -; NONEON-NOSVE-NEXT: mov v0.s[1], w11 -; NONEON-NOSVE-NEXT: sdiv w14, w15, w14 -; NONEON-NOSVE-NEXT: mov w15, v1.s[3] -; NONEON-NOSVE-NEXT: fmov s1, w9 -; NONEON-NOSVE-NEXT: mov v0.s[2], w13 -; NONEON-NOSVE-NEXT: mov v1.s[1], w8 -; NONEON-NOSVE-NEXT: mov v1.s[2], w10 -; NONEON-NOSVE-NEXT: sdiv w8, w16, w15 -; NONEON-NOSVE-NEXT: mov v0.s[3], w14 -; NONEON-NOSVE-NEXT: mov v1.s[3], w8 +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: sdiv w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; 
NONEON-NOSVE-NEXT: sdiv w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: sdiv w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -856,12 +851,14 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: sdiv_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fmov x8, d1 ; NONEON-NOSVE-NEXT: fmov x9, d0 ; NONEON-NOSVE-NEXT: sdiv x8, x9, x8 -; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = sdiv <1 x i64> %op1, %op2 ret <1 x i64> %res @@ -879,14 +876,16 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: sdiv_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d1 -; NONEON-NOSVE-NEXT: fmov x9, d0 -; NONEON-NOSVE-NEXT: mov x10, v0.d[1] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: sdiv x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] ; NONEON-NOSVE-NEXT: sdiv x8, x9, x8 -; NONEON-NOSVE-NEXT: mov x9, v1.d[1] -; NONEON-NOSVE-NEXT: sdiv x9, x10, x9 -; NONEON-NOSVE-NEXT: fmov d0, x8 -; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = sdiv <2 x i64> %op1, %op2 ret <2 x i64> %res @@ -906,25 +905,27 @@ define void @sdiv_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: sdiv_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] -; NONEON-NOSVE-NEXT: fmov x9, d0 -; NONEON-NOSVE-NEXT: mov x10, v2.d[1] -; NONEON-NOSVE-NEXT: fmov x11, d2 -; NONEON-NOSVE-NEXT: fmov x8, d1 -; NONEON-NOSVE-NEXT: mov x12, v0.d[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: sdiv x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] ; NONEON-NOSVE-NEXT: sdiv x8, x9, x8 -; NONEON-NOSVE-NEXT: mov x9, v3.d[1] -; NONEON-NOSVE-NEXT: sdiv x9, x10, x9 -; NONEON-NOSVE-NEXT: fmov x10, d3 -; NONEON-NOSVE-NEXT: sdiv x10, x11, x10 -; NONEON-NOSVE-NEXT: mov x11, v1.d[1] -; NONEON-NOSVE-NEXT: fmov d1, x8 -; NONEON-NOSVE-NEXT: sdiv x11, x12, x11 -; NONEON-NOSVE-NEXT: fmov d0, x10 -; NONEON-NOSVE-NEXT: mov v0.d[1], x9 -; NONEON-NOSVE-NEXT: mov v1.d[1], x11 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: sdiv x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, 
[sp, #16] +; NONEON-NOSVE-NEXT: sdiv x8, x9, x8 +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -954,33 +955,27 @@ define <4 x i8> @udiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: udiv_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: umov w8, v1.h[1] -; NONEON-NOSVE-NEXT: umov w9, v0.h[1] -; NONEON-NOSVE-NEXT: umov w10, v0.h[0] -; NONEON-NOSVE-NEXT: umov w11, v0.h[2] -; NONEON-NOSVE-NEXT: umov w12, v0.h[3] -; NONEON-NOSVE-NEXT: and w8, w8, #0xff -; NONEON-NOSVE-NEXT: and w9, w9, #0xff -; NONEON-NOSVE-NEXT: and w10, w10, #0xff -; NONEON-NOSVE-NEXT: udiv w8, w9, w8 -; NONEON-NOSVE-NEXT: umov w9, v1.h[0] -; NONEON-NOSVE-NEXT: and w11, w11, #0xff -; NONEON-NOSVE-NEXT: and w9, w9, #0xff +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #8] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #20] ; NONEON-NOSVE-NEXT: udiv w9, w10, w9 -; NONEON-NOSVE-NEXT: umov w10, v1.h[2] -; NONEON-NOSVE-NEXT: and w10, w10, #0xff +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #18] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] ; NONEON-NOSVE-NEXT: udiv w10, w11, w10 -; NONEON-NOSVE-NEXT: umov w11, v1.h[3] -; NONEON-NOSVE-NEXT: fmov s0, w9 -; NONEON-NOSVE-NEXT: mov v0.h[1], w8 -; NONEON-NOSVE-NEXT: and w9, w11, #0xff -; NONEON-NOSVE-NEXT: and w11, w12, #0xff -; NONEON-NOSVE-NEXT: udiv w8, w11, w9 -; NONEON-NOSVE-NEXT: mov v0.h[2], w10 -; NONEON-NOSVE-NEXT: mov v0.h[3], 
w8 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #16] +; NONEON-NOSVE-NEXT: strh w9, [sp, #28] +; NONEON-NOSVE-NEXT: udiv w11, w12, w11 +; NONEON-NOSVE-NEXT: strh w10, [sp, #26] +; NONEON-NOSVE-NEXT: strh w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = udiv <4 x i8> %op1, %op2 ret <4 x i8> %res @@ -1012,41 +1007,43 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: udiv_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: umov w8, v1.b[1] -; NONEON-NOSVE-NEXT: umov w9, v0.b[1] -; NONEON-NOSVE-NEXT: umov w10, v0.b[0] -; NONEON-NOSVE-NEXT: umov w11, v0.b[2] -; NONEON-NOSVE-NEXT: umov w12, v0.b[3] -; NONEON-NOSVE-NEXT: umov w13, v0.b[4] -; NONEON-NOSVE-NEXT: umov w14, v0.b[5] -; NONEON-NOSVE-NEXT: udiv w8, w9, w8 -; NONEON-NOSVE-NEXT: umov w9, v1.b[0] -; NONEON-NOSVE-NEXT: udiv w9, w10, w9 -; NONEON-NOSVE-NEXT: umov w10, v1.b[2] -; NONEON-NOSVE-NEXT: udiv w10, w11, w10 -; NONEON-NOSVE-NEXT: umov w11, v1.b[3] -; NONEON-NOSVE-NEXT: fmov s2, w9 -; NONEON-NOSVE-NEXT: umov w9, v1.b[6] -; NONEON-NOSVE-NEXT: mov v2.b[1], w8 -; NONEON-NOSVE-NEXT: udiv w11, w12, w11 -; NONEON-NOSVE-NEXT: umov w12, v1.b[4] -; NONEON-NOSVE-NEXT: mov v2.b[2], w10 -; NONEON-NOSVE-NEXT: umov w10, v0.b[6] -; NONEON-NOSVE-NEXT: udiv w12, w13, w12 -; NONEON-NOSVE-NEXT: umov w13, v1.b[5] -; NONEON-NOSVE-NEXT: mov v2.b[3], w11 -; NONEON-NOSVE-NEXT: umov w11, v0.b[7] -; NONEON-NOSVE-NEXT: udiv w8, w14, w13 -; NONEON-NOSVE-NEXT: mov v2.b[4], w12 -; NONEON-NOSVE-NEXT: udiv w9, w10, w9 -; NONEON-NOSVE-NEXT: umov w10, v1.b[7] -; NONEON-NOSVE-NEXT: mov v2.b[5], w8 -; NONEON-NOSVE-NEXT: udiv w8, w11, w10 -; NONEON-NOSVE-NEXT: mov v2.b[6], w9 -; NONEON-NOSVE-NEXT: mov v2.b[7], w8 -; NONEON-NOSVE-NEXT: fmov d0, d2 +; NONEON-NOSVE-NEXT: 
sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = udiv <8 x i8> %op1, %op2 ret <8 x i8> %res @@ -1098,71 +1095,74 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: udiv_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umov w8, v1.b[1] -; NONEON-NOSVE-NEXT: umov w9, v0.b[1] -; NONEON-NOSVE-NEXT: umov w10, v0.b[0] -; NONEON-NOSVE-NEXT: umov w11, v0.b[2] -; NONEON-NOSVE-NEXT: umov w12, v0.b[3] -; NONEON-NOSVE-NEXT: umov w13, v0.b[4] -; NONEON-NOSVE-NEXT: umov w14, v0.b[5] -; NONEON-NOSVE-NEXT: umov 
w15, v0.b[6] -; NONEON-NOSVE-NEXT: umov w16, v0.b[7] -; NONEON-NOSVE-NEXT: umov w17, v0.b[8] -; NONEON-NOSVE-NEXT: umov w18, v0.b[9] -; NONEON-NOSVE-NEXT: udiv w8, w9, w8 -; NONEON-NOSVE-NEXT: umov w9, v1.b[0] -; NONEON-NOSVE-NEXT: udiv w9, w10, w9 -; NONEON-NOSVE-NEXT: umov w10, v1.b[2] -; NONEON-NOSVE-NEXT: udiv w10, w11, w10 -; NONEON-NOSVE-NEXT: umov w11, v1.b[3] -; NONEON-NOSVE-NEXT: fmov s2, w9 -; NONEON-NOSVE-NEXT: umov w9, v1.b[10] -; NONEON-NOSVE-NEXT: mov v2.b[1], w8 -; NONEON-NOSVE-NEXT: udiv w11, w12, w11 -; NONEON-NOSVE-NEXT: umov w12, v1.b[4] -; NONEON-NOSVE-NEXT: mov v2.b[2], w10 -; NONEON-NOSVE-NEXT: umov w10, v0.b[10] -; NONEON-NOSVE-NEXT: udiv w12, w13, w12 -; NONEON-NOSVE-NEXT: umov w13, v1.b[5] -; NONEON-NOSVE-NEXT: mov v2.b[3], w11 -; NONEON-NOSVE-NEXT: umov w11, v0.b[11] -; NONEON-NOSVE-NEXT: udiv w13, w14, w13 -; NONEON-NOSVE-NEXT: umov w14, v1.b[6] -; NONEON-NOSVE-NEXT: mov v2.b[4], w12 -; NONEON-NOSVE-NEXT: umov w12, v0.b[12] -; NONEON-NOSVE-NEXT: udiv w14, w15, w14 -; NONEON-NOSVE-NEXT: umov w15, v1.b[7] -; NONEON-NOSVE-NEXT: mov v2.b[5], w13 -; NONEON-NOSVE-NEXT: umov w13, v0.b[13] -; NONEON-NOSVE-NEXT: udiv w15, w16, w15 -; NONEON-NOSVE-NEXT: umov w16, v1.b[8] -; NONEON-NOSVE-NEXT: mov v2.b[6], w14 -; NONEON-NOSVE-NEXT: udiv w16, w17, w16 -; NONEON-NOSVE-NEXT: umov w17, v1.b[9] -; NONEON-NOSVE-NEXT: mov v2.b[7], w15 -; NONEON-NOSVE-NEXT: udiv w8, w18, w17 -; NONEON-NOSVE-NEXT: mov v2.b[8], w16 -; NONEON-NOSVE-NEXT: udiv w9, w10, w9 -; NONEON-NOSVE-NEXT: umov w10, v1.b[11] -; NONEON-NOSVE-NEXT: mov v2.b[9], w8 -; NONEON-NOSVE-NEXT: udiv w10, w11, w10 -; NONEON-NOSVE-NEXT: umov w11, v1.b[12] -; NONEON-NOSVE-NEXT: mov v2.b[10], w9 -; NONEON-NOSVE-NEXT: umov w9, v1.b[14] -; NONEON-NOSVE-NEXT: udiv w11, w12, w11 -; NONEON-NOSVE-NEXT: umov w12, v1.b[13] -; NONEON-NOSVE-NEXT: mov v2.b[11], w10 -; NONEON-NOSVE-NEXT: umov w10, v1.b[15] -; NONEON-NOSVE-NEXT: udiv w8, w13, w12 -; NONEON-NOSVE-NEXT: umov w12, v0.b[14] -; NONEON-NOSVE-NEXT: mov 
v2.b[12], w11 -; NONEON-NOSVE-NEXT: umov w11, v0.b[15] -; NONEON-NOSVE-NEXT: udiv w9, w12, w9 -; NONEON-NOSVE-NEXT: mov v2.b[13], w8 -; NONEON-NOSVE-NEXT: udiv w8, w11, w10 -; NONEON-NOSVE-NEXT: mov v2.b[14], w9 -; NONEON-NOSVE-NEXT: mov v2.b[15], w8 -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; 
NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = udiv <16 x i8> %op1, %op2 ret <16 x i8> %res @@ -1247,159 +1247,143 @@ define void @udiv_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: udiv_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str x27, [sp, #-80]! 
// 8-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #16] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #32] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 -; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 -; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 -; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 -; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 -; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 -; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 -; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 -; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 -; NONEON-NOSVE-NEXT: .cfi_offset w27, -80 -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x0] -; NONEON-NOSVE-NEXT: ldr q3, [x1] -; NONEON-NOSVE-NEXT: umov w8, v1.b[1] -; NONEON-NOSVE-NEXT: umov w9, v0.b[1] -; NONEON-NOSVE-NEXT: umov w10, v0.b[0] -; NONEON-NOSVE-NEXT: umov w11, v0.b[2] -; NONEON-NOSVE-NEXT: umov w12, v0.b[3] -; NONEON-NOSVE-NEXT: umov w13, v0.b[4] -; NONEON-NOSVE-NEXT: umov w14, v0.b[5] -; NONEON-NOSVE-NEXT: umov w15, v0.b[6] -; NONEON-NOSVE-NEXT: umov w17, v0.b[8] -; NONEON-NOSVE-NEXT: umov w2, v0.b[10] -; NONEON-NOSVE-NEXT: umov w3, v0.b[11] -; NONEON-NOSVE-NEXT: umov w4, v0.b[12] -; NONEON-NOSVE-NEXT: udiv w8, w9, w8 -; NONEON-NOSVE-NEXT: umov w9, v1.b[0] -; NONEON-NOSVE-NEXT: umov w5, v0.b[13] -; NONEON-NOSVE-NEXT: umov w6, v0.b[14] -; NONEON-NOSVE-NEXT: umov w1, v3.b[1] -; NONEON-NOSVE-NEXT: umov w7, v2.b[0] -; NONEON-NOSVE-NEXT: umov w19, v2.b[2] -; NONEON-NOSVE-NEXT: umov w20, v2.b[3] -; NONEON-NOSVE-NEXT: umov w21, v2.b[4] -; NONEON-NOSVE-NEXT: umov w22, v2.b[5] -; NONEON-NOSVE-NEXT: umov w23, v2.b[6] -; NONEON-NOSVE-NEXT: umov w24, v2.b[7] -; NONEON-NOSVE-NEXT: umov w25, v2.b[8] -; NONEON-NOSVE-NEXT: umov w26, v2.b[9] -; NONEON-NOSVE-NEXT: umov w27, v2.b[10] -; NONEON-NOSVE-NEXT: udiv w9, w10, w9 -; 
NONEON-NOSVE-NEXT: umov w10, v1.b[2] -; NONEON-NOSVE-NEXT: udiv w11, w11, w10 -; NONEON-NOSVE-NEXT: umov w10, v1.b[3] -; NONEON-NOSVE-NEXT: fmov s5, w9 -; NONEON-NOSVE-NEXT: umov w9, v3.b[11] -; NONEON-NOSVE-NEXT: mov v5.b[1], w8 -; NONEON-NOSVE-NEXT: udiv w10, w12, w10 -; NONEON-NOSVE-NEXT: umov w12, v1.b[4] -; NONEON-NOSVE-NEXT: mov v5.b[2], w11 -; NONEON-NOSVE-NEXT: umov w11, v2.b[11] -; NONEON-NOSVE-NEXT: udiv w13, w13, w12 -; NONEON-NOSVE-NEXT: umov w12, v1.b[5] -; NONEON-NOSVE-NEXT: mov v5.b[3], w10 -; NONEON-NOSVE-NEXT: umov w10, v3.b[12] -; NONEON-NOSVE-NEXT: udiv w12, w14, w12 -; NONEON-NOSVE-NEXT: umov w14, v1.b[6] -; NONEON-NOSVE-NEXT: mov v5.b[4], w13 -; NONEON-NOSVE-NEXT: umov w13, v2.b[14] -; NONEON-NOSVE-NEXT: udiv w16, w15, w14 -; NONEON-NOSVE-NEXT: umov w14, v1.b[7] -; NONEON-NOSVE-NEXT: umov w15, v0.b[7] -; NONEON-NOSVE-NEXT: mov v5.b[5], w12 -; NONEON-NOSVE-NEXT: umov w12, v2.b[13] -; NONEON-NOSVE-NEXT: udiv w14, w15, w14 -; NONEON-NOSVE-NEXT: umov w15, v1.b[8] -; NONEON-NOSVE-NEXT: mov v5.b[6], w16 -; NONEON-NOSVE-NEXT: udiv w18, w17, w15 -; NONEON-NOSVE-NEXT: umov w15, v1.b[9] -; NONEON-NOSVE-NEXT: umov w17, v0.b[9] -; NONEON-NOSVE-NEXT: mov v5.b[7], w14 -; NONEON-NOSVE-NEXT: udiv w17, w17, w15 -; NONEON-NOSVE-NEXT: umov w15, v1.b[10] -; NONEON-NOSVE-NEXT: mov v5.b[8], w18 -; NONEON-NOSVE-NEXT: udiv w15, w2, w15 -; NONEON-NOSVE-NEXT: umov w2, v1.b[11] -; NONEON-NOSVE-NEXT: mov v5.b[9], w17 -; NONEON-NOSVE-NEXT: udiv w2, w3, w2 -; NONEON-NOSVE-NEXT: umov w3, v1.b[12] -; NONEON-NOSVE-NEXT: mov v5.b[10], w15 -; NONEON-NOSVE-NEXT: udiv w3, w4, w3 -; NONEON-NOSVE-NEXT: umov w4, v1.b[13] -; NONEON-NOSVE-NEXT: mov v5.b[11], w2 -; NONEON-NOSVE-NEXT: udiv w4, w5, w4 -; NONEON-NOSVE-NEXT: umov w5, v1.b[14] -; NONEON-NOSVE-NEXT: mov v5.b[12], w3 -; NONEON-NOSVE-NEXT: udiv w5, w6, w5 -; NONEON-NOSVE-NEXT: umov w6, v2.b[1] -; NONEON-NOSVE-NEXT: mov v5.b[13], w4 -; NONEON-NOSVE-NEXT: udiv w1, w6, w1 -; NONEON-NOSVE-NEXT: umov w6, v3.b[0] -; 
NONEON-NOSVE-NEXT: mov v5.b[14], w5 -; NONEON-NOSVE-NEXT: udiv w6, w7, w6 -; NONEON-NOSVE-NEXT: umov w7, v3.b[2] -; NONEON-NOSVE-NEXT: udiv w7, w19, w7 -; NONEON-NOSVE-NEXT: umov w19, v3.b[3] -; NONEON-NOSVE-NEXT: fmov s4, w6 -; NONEON-NOSVE-NEXT: mov v4.b[1], w1 -; NONEON-NOSVE-NEXT: udiv w19, w20, w19 -; NONEON-NOSVE-NEXT: umov w20, v3.b[4] -; NONEON-NOSVE-NEXT: mov v4.b[2], w7 -; NONEON-NOSVE-NEXT: udiv w20, w21, w20 -; NONEON-NOSVE-NEXT: umov w21, v3.b[5] -; NONEON-NOSVE-NEXT: mov v4.b[3], w19 -; NONEON-NOSVE-NEXT: udiv w21, w22, w21 -; NONEON-NOSVE-NEXT: umov w22, v3.b[6] -; NONEON-NOSVE-NEXT: mov v4.b[4], w20 -; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: udiv w22, w23, w22 -; NONEON-NOSVE-NEXT: umov w23, v3.b[7] -; NONEON-NOSVE-NEXT: mov v4.b[5], w21 -; NONEON-NOSVE-NEXT: udiv w23, w24, w23 -; NONEON-NOSVE-NEXT: umov w24, v3.b[8] -; NONEON-NOSVE-NEXT: mov v4.b[6], w22 -; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: udiv w24, w25, w24 -; NONEON-NOSVE-NEXT: umov w25, v3.b[9] -; NONEON-NOSVE-NEXT: mov v4.b[7], w23 -; NONEON-NOSVE-NEXT: udiv w25, w26, w25 -; NONEON-NOSVE-NEXT: umov w26, v3.b[10] -; NONEON-NOSVE-NEXT: mov v4.b[8], w24 -; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #32] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: udiv w8, w27, w26 -; NONEON-NOSVE-NEXT: mov v4.b[9], w25 -; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #16] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: udiv w9, w11, w9 -; NONEON-NOSVE-NEXT: umov w11, v2.b[12] -; NONEON-NOSVE-NEXT: mov v4.b[10], w8 -; NONEON-NOSVE-NEXT: umov w8, v3.b[15] -; NONEON-NOSVE-NEXT: udiv w10, w11, w10 -; NONEON-NOSVE-NEXT: umov w11, v3.b[13] -; NONEON-NOSVE-NEXT: mov v4.b[11], w9 -; NONEON-NOSVE-NEXT: umov w9, v1.b[15] -; NONEON-NOSVE-NEXT: udiv w11, w12, w11 -; NONEON-NOSVE-NEXT: umov w12, v3.b[14] -; NONEON-NOSVE-NEXT: mov v4.b[12], w10 -; NONEON-NOSVE-NEXT: umov w10, v0.b[15] -; NONEON-NOSVE-NEXT: udiv w12, w13, w12 -; 
NONEON-NOSVE-NEXT: umov w13, v2.b[15] -; NONEON-NOSVE-NEXT: mov v4.b[13], w11 -; NONEON-NOSVE-NEXT: udiv w8, w13, w8 -; NONEON-NOSVE-NEXT: mov v4.b[14], w12 -; NONEON-NOSVE-NEXT: udiv w9, w10, w9 -; NONEON-NOSVE-NEXT: mov v4.b[15], w8 -; NONEON-NOSVE-NEXT: mov v5.b[15], w9 -; NONEON-NOSVE-NEXT: stp q4, q5, [x0] -; NONEON-NOSVE-NEXT: ldr x27, [sp], #80 // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; 
NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, 
[sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -1422,18 +1406,18 @@ 
define <2 x i16> @udiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: udiv_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d2, #0x00ffff0000ffff -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v2.8b -; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: fmov w8, s1 -; NONEON-NOSVE-NEXT: fmov w9, s0 -; NONEON-NOSVE-NEXT: mov w10, v0.s[1] -; NONEON-NOSVE-NEXT: udiv w8, w9, w8 -; NONEON-NOSVE-NEXT: mov w9, v1.s[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #8] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #16] ; NONEON-NOSVE-NEXT: udiv w9, w10, w9 -; NONEON-NOSVE-NEXT: fmov s0, w8 -; NONEON-NOSVE-NEXT: mov v0.s[1], w9 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = udiv <2 x i16> %op1, %op2 ret <2 x i16> %res @@ -1454,25 +1438,27 @@ define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: udiv_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: umov w8, v1.h[1] -; NONEON-NOSVE-NEXT: umov w9, v0.h[1] -; NONEON-NOSVE-NEXT: umov w10, v0.h[0] -; NONEON-NOSVE-NEXT: umov w11, v0.h[2] -; NONEON-NOSVE-NEXT: umov w12, v0.h[3] -; NONEON-NOSVE-NEXT: udiv w8, w9, w8 -; NONEON-NOSVE-NEXT: umov w9, v1.h[0] -; NONEON-NOSVE-NEXT: udiv w9, w10, w9 -; NONEON-NOSVE-NEXT: umov w10, v1.h[2] -; NONEON-NOSVE-NEXT: udiv w10, w11, w10 -; NONEON-NOSVE-NEXT: umov w11, v1.h[3] -; NONEON-NOSVE-NEXT: fmov s0, w9 -; NONEON-NOSVE-NEXT: mov v0.h[1], w8 -; NONEON-NOSVE-NEXT: udiv w8, w12, w11 -; NONEON-NOSVE-NEXT: mov v0.h[2], w10 
-; NONEON-NOSVE-NEXT: mov v0.h[3], w8 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = udiv <4 x i16> %op1, %op2 ret <4 x i16> %res @@ -1503,39 +1489,42 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: udiv_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umov w8, v1.h[1] -; NONEON-NOSVE-NEXT: umov w9, v0.h[1] -; NONEON-NOSVE-NEXT: umov w10, v0.h[0] -; NONEON-NOSVE-NEXT: umov w11, v0.h[2] -; NONEON-NOSVE-NEXT: umov w12, v0.h[3] -; NONEON-NOSVE-NEXT: umov w13, v0.h[4] -; NONEON-NOSVE-NEXT: umov w14, v0.h[5] -; NONEON-NOSVE-NEXT: udiv w8, w9, w8 -; NONEON-NOSVE-NEXT: umov w9, v1.h[0] -; NONEON-NOSVE-NEXT: udiv w9, w10, w9 -; NONEON-NOSVE-NEXT: umov w10, v1.h[2] -; NONEON-NOSVE-NEXT: udiv w10, w11, w10 -; NONEON-NOSVE-NEXT: umov w11, v1.h[3] -; NONEON-NOSVE-NEXT: fmov s2, w9 -; NONEON-NOSVE-NEXT: umov w9, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[1], w8 -; NONEON-NOSVE-NEXT: udiv w11, w12, w11 -; NONEON-NOSVE-NEXT: umov w12, v1.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[2], w10 -; NONEON-NOSVE-NEXT: umov w10, v0.h[6] -; NONEON-NOSVE-NEXT: udiv w12, w13, w12 
-; NONEON-NOSVE-NEXT: umov w13, v1.h[5] -; NONEON-NOSVE-NEXT: mov v2.h[3], w11 -; NONEON-NOSVE-NEXT: umov w11, v0.h[7] -; NONEON-NOSVE-NEXT: udiv w8, w14, w13 -; NONEON-NOSVE-NEXT: mov v2.h[4], w12 -; NONEON-NOSVE-NEXT: udiv w9, w10, w9 -; NONEON-NOSVE-NEXT: umov w10, v1.h[7] -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: udiv w8, w11, w10 -; NONEON-NOSVE-NEXT: mov v2.h[6], w9 -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = udiv <8 x i16> %op1, %op2 ret <8 x i16> %res @@ 
-1580,75 +1569,79 @@ define void @udiv_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: udiv_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x0] -; NONEON-NOSVE-NEXT: ldr q3, [x1] -; NONEON-NOSVE-NEXT: umov w8, v1.h[1] -; NONEON-NOSVE-NEXT: umov w9, v0.h[1] -; NONEON-NOSVE-NEXT: umov w10, v0.h[0] -; NONEON-NOSVE-NEXT: umov w11, v0.h[2] -; NONEON-NOSVE-NEXT: umov w12, v0.h[3] -; NONEON-NOSVE-NEXT: umov w13, v0.h[4] -; NONEON-NOSVE-NEXT: umov w14, v0.h[5] -; NONEON-NOSVE-NEXT: umov w15, v0.h[6] -; NONEON-NOSVE-NEXT: umov w16, v2.h[1] -; NONEON-NOSVE-NEXT: umov w17, v2.h[0] -; NONEON-NOSVE-NEXT: umov w18, v2.h[2] -; NONEON-NOSVE-NEXT: umov w1, v2.h[3] -; NONEON-NOSVE-NEXT: udiv w8, w9, w8 -; NONEON-NOSVE-NEXT: umov w9, v1.h[0] -; NONEON-NOSVE-NEXT: umov w2, v2.h[4] -; NONEON-NOSVE-NEXT: umov w3, v2.h[5] -; NONEON-NOSVE-NEXT: umov w4, v2.h[6] -; NONEON-NOSVE-NEXT: udiv w10, w10, w9 -; NONEON-NOSVE-NEXT: umov w9, v1.h[2] -; NONEON-NOSVE-NEXT: udiv w9, w11, w9 -; NONEON-NOSVE-NEXT: umov w11, v1.h[3] -; NONEON-NOSVE-NEXT: fmov s5, w10 -; NONEON-NOSVE-NEXT: umov w10, v3.h[7] -; NONEON-NOSVE-NEXT: mov v5.h[1], w8 -; NONEON-NOSVE-NEXT: udiv w11, w12, w11 -; NONEON-NOSVE-NEXT: umov w12, v1.h[4] -; NONEON-NOSVE-NEXT: mov v5.h[2], w9 -; NONEON-NOSVE-NEXT: umov w9, v2.h[7] -; NONEON-NOSVE-NEXT: udiv w12, w13, w12 -; NONEON-NOSVE-NEXT: umov w13, v1.h[5] -; NONEON-NOSVE-NEXT: mov v5.h[3], w11 -; NONEON-NOSVE-NEXT: umov w11, v0.h[7] -; NONEON-NOSVE-NEXT: udiv w13, w14, w13 -; NONEON-NOSVE-NEXT: umov w14, v1.h[6] -; NONEON-NOSVE-NEXT: mov v5.h[4], w12 -; NONEON-NOSVE-NEXT: udiv w14, w15, w14 -; NONEON-NOSVE-NEXT: umov w15, v3.h[1] -; NONEON-NOSVE-NEXT: mov v5.h[5], w13 -; NONEON-NOSVE-NEXT: udiv w15, w16, w15 -; NONEON-NOSVE-NEXT: umov w16, v3.h[0] -; NONEON-NOSVE-NEXT: mov v5.h[6], w14 -; NONEON-NOSVE-NEXT: udiv w16, w17, w16 -; NONEON-NOSVE-NEXT: umov w17, v3.h[2] -; NONEON-NOSVE-NEXT: 
udiv w17, w18, w17 -; NONEON-NOSVE-NEXT: umov w18, v3.h[3] -; NONEON-NOSVE-NEXT: fmov s4, w16 -; NONEON-NOSVE-NEXT: mov v4.h[1], w15 -; NONEON-NOSVE-NEXT: udiv w18, w1, w18 -; NONEON-NOSVE-NEXT: umov w1, v3.h[4] -; NONEON-NOSVE-NEXT: mov v4.h[2], w17 -; NONEON-NOSVE-NEXT: udiv w1, w2, w1 -; NONEON-NOSVE-NEXT: umov w2, v3.h[5] -; NONEON-NOSVE-NEXT: mov v4.h[3], w18 -; NONEON-NOSVE-NEXT: udiv w2, w3, w2 -; NONEON-NOSVE-NEXT: umov w3, v3.h[6] -; NONEON-NOSVE-NEXT: mov v4.h[4], w1 -; NONEON-NOSVE-NEXT: udiv w8, w4, w3 -; NONEON-NOSVE-NEXT: mov v4.h[5], w2 -; NONEON-NOSVE-NEXT: udiv w9, w9, w10 -; NONEON-NOSVE-NEXT: umov w10, v1.h[7] -; NONEON-NOSVE-NEXT: mov v4.h[6], w8 -; NONEON-NOSVE-NEXT: udiv w10, w11, w10 -; NONEON-NOSVE-NEXT: mov v4.h[7], w9 -; NONEON-NOSVE-NEXT: mov v5.h[7], w10 -; NONEON-NOSVE-NEXT: stp q4, q5, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: udiv w8, w9, 
w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -1669,17 +1662,17 @@ define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: udiv_v2i32: ; NONEON-NOSVE: // 
%bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: fmov w8, s1 -; NONEON-NOSVE-NEXT: fmov w9, s0 -; NONEON-NOSVE-NEXT: mov w10, v0.s[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] ; NONEON-NOSVE-NEXT: udiv w8, w9, w8 -; NONEON-NOSVE-NEXT: mov w9, v1.s[1] -; NONEON-NOSVE-NEXT: udiv w9, w10, w9 -; NONEON-NOSVE-NEXT: fmov s0, w8 -; NONEON-NOSVE-NEXT: mov v0.s[1], w9 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = udiv <2 x i32> %op1, %op2 ret <2 x i32> %res @@ -1697,22 +1690,22 @@ define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: udiv_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, v1.s[1] -; NONEON-NOSVE-NEXT: mov w9, v0.s[1] -; NONEON-NOSVE-NEXT: fmov w10, s0 -; NONEON-NOSVE-NEXT: mov w11, v0.s[2] -; NONEON-NOSVE-NEXT: mov w12, v0.s[3] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: udiv w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] ; NONEON-NOSVE-NEXT: udiv w8, w9, w8 -; NONEON-NOSVE-NEXT: fmov w9, s1 -; NONEON-NOSVE-NEXT: udiv w9, w10, w9 -; NONEON-NOSVE-NEXT: mov w10, v1.s[2] -; NONEON-NOSVE-NEXT: udiv w10, w11, w10 -; NONEON-NOSVE-NEXT: mov w11, v1.s[3] -; NONEON-NOSVE-NEXT: fmov s0, w9 -; NONEON-NOSVE-NEXT: mov v0.s[1], w8 -; NONEON-NOSVE-NEXT: udiv w8, w12, w11 -; NONEON-NOSVE-NEXT: mov v0.s[2], w10 -; NONEON-NOSVE-NEXT: mov v0.s[3], w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = udiv <4 x i32> %op1, %op2 ret <4 x i32> %res @@ -1732,41 +1725,39 @@ define void @udiv_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: udiv_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] -; NONEON-NOSVE-NEXT: mov w9, v0.s[1] -; NONEON-NOSVE-NEXT: fmov w10, s0 -; NONEON-NOSVE-NEXT: mov w11, v0.s[2] -; NONEON-NOSVE-NEXT: mov w8, v1.s[1] -; NONEON-NOSVE-NEXT: mov w12, v2.s[1] -; NONEON-NOSVE-NEXT: fmov w13, s2 -; NONEON-NOSVE-NEXT: mov w14, v2.s[2] -; NONEON-NOSVE-NEXT: mov w15, v2.s[3] -; NONEON-NOSVE-NEXT: mov w16, v0.s[3] -; NONEON-NOSVE-NEXT: udiv w8, w9, w8 -; NONEON-NOSVE-NEXT: fmov w9, s1 -; NONEON-NOSVE-NEXT: udiv w9, w10, w9 -; NONEON-NOSVE-NEXT: mov w10, v1.s[2] -; NONEON-NOSVE-NEXT: udiv w10, w11, w10 -; NONEON-NOSVE-NEXT: mov w11, v3.s[1] -; NONEON-NOSVE-NEXT: udiv w11, w12, w11 -; NONEON-NOSVE-NEXT: fmov w12, s3 -; NONEON-NOSVE-NEXT: udiv w12, w13, w12 -; NONEON-NOSVE-NEXT: mov w13, 
v3.s[2] -; NONEON-NOSVE-NEXT: udiv w13, w14, w13 -; NONEON-NOSVE-NEXT: mov w14, v3.s[3] -; NONEON-NOSVE-NEXT: fmov s0, w12 -; NONEON-NOSVE-NEXT: mov v0.s[1], w11 -; NONEON-NOSVE-NEXT: udiv w14, w15, w14 -; NONEON-NOSVE-NEXT: mov w15, v1.s[3] -; NONEON-NOSVE-NEXT: fmov s1, w9 -; NONEON-NOSVE-NEXT: mov v0.s[2], w13 -; NONEON-NOSVE-NEXT: mov v1.s[1], w8 -; NONEON-NOSVE-NEXT: mov v1.s[2], w10 -; NONEON-NOSVE-NEXT: udiv w8, w16, w15 -; NONEON-NOSVE-NEXT: mov v0.s[3], w14 -; NONEON-NOSVE-NEXT: mov v1.s[3], w8 +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: udiv w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: udiv w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: udiv w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -1787,12 +1778,14 @@ define <1 x i64> @udiv_v1i64(<1 
x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: udiv_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fmov x8, d1 ; NONEON-NOSVE-NEXT: fmov x9, d0 ; NONEON-NOSVE-NEXT: udiv x8, x9, x8 -; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = udiv <1 x i64> %op1, %op2 ret <1 x i64> %res @@ -1810,14 +1803,16 @@ define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: udiv_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d1 -; NONEON-NOSVE-NEXT: fmov x9, d0 -; NONEON-NOSVE-NEXT: mov x10, v0.d[1] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: udiv x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] ; NONEON-NOSVE-NEXT: udiv x8, x9, x8 -; NONEON-NOSVE-NEXT: mov x9, v1.d[1] -; NONEON-NOSVE-NEXT: udiv x9, x10, x9 -; NONEON-NOSVE-NEXT: fmov d0, x8 -; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = udiv <2 x i64> %op1, %op2 ret <2 x i64> %res @@ -1837,25 +1832,27 @@ define void @udiv_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: udiv_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] -; NONEON-NOSVE-NEXT: fmov x9, d0 -; NONEON-NOSVE-NEXT: mov x10, v2.d[1] -; NONEON-NOSVE-NEXT: fmov x11, d2 -; NONEON-NOSVE-NEXT: fmov x8, d1 -; NONEON-NOSVE-NEXT: mov x12, v0.d[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; 
NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: udiv x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: udiv x8, x9, x8 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: udiv x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] ; NONEON-NOSVE-NEXT: udiv x8, x9, x8 -; NONEON-NOSVE-NEXT: mov x9, v3.d[1] -; NONEON-NOSVE-NEXT: udiv x9, x10, x9 -; NONEON-NOSVE-NEXT: fmov x10, d3 -; NONEON-NOSVE-NEXT: udiv x10, x11, x10 -; NONEON-NOSVE-NEXT: mov x11, v1.d[1] -; NONEON-NOSVE-NEXT: fmov d1, x8 -; NONEON-NOSVE-NEXT: udiv x11, x12, x11 -; NONEON-NOSVE-NEXT: fmov d0, x10 -; NONEON-NOSVE-NEXT: mov v0.d[1], x9 -; NONEON-NOSVE-NEXT: mov v1.d[1], x11 +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -1905,23 +1902,66 @@ define void @udiv_constantsplat_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: udiv_constantsplat_v8i32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] ; NONEON-NOSVE-NEXT: mov w8, #8969 // =0x2309 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] ; NONEON-NOSVE-NEXT: movk w8, #22765, lsl #16 -; NONEON-NOSVE-NEXT: dup v0.4s, w8 -; NONEON-NOSVE-NEXT: umull2 v3.2d, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: umull v4.2d, v1.2s, v0.2s -; NONEON-NOSVE-NEXT: umull2 v5.2d, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: umull v0.2d, v2.2s, v0.2s -; NONEON-NOSVE-NEXT: uzp2 v3.4s, v4.4s, v3.4s -; NONEON-NOSVE-NEXT: uzp2 v0.4s, v0.4s, v5.4s -; NONEON-NOSVE-NEXT: sub v1.4s, v1.4s, v3.4s -; NONEON-NOSVE-NEXT: sub v2.4s, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: usra v3.4s, 
v1.4s, #1 -; NONEON-NOSVE-NEXT: usra v0.4s, v2.4s, #1 -; NONEON-NOSVE-NEXT: ushr v1.4s, v3.4s, #6 -; NONEON-NOSVE-NEXT: ushr v0.4s, v0.4s, #6 -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #28] +; NONEON-NOSVE-NEXT: umull x10, w9, w8 +; NONEON-NOSVE-NEXT: lsr x10, x10, #32 +; NONEON-NOSVE-NEXT: sub w9, w9, w10 +; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1 +; NONEON-NOSVE-NEXT: lsr w11, w9, #6 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #24] +; NONEON-NOSVE-NEXT: umull x10, w9, w8 +; NONEON-NOSVE-NEXT: lsr x10, x10, #32 +; NONEON-NOSVE-NEXT: sub w9, w9, w10 +; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1 +; NONEON-NOSVE-NEXT: lsr w9, w9, #6 +; NONEON-NOSVE-NEXT: stp w9, w11, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #20] +; NONEON-NOSVE-NEXT: umull x10, w9, w8 +; NONEON-NOSVE-NEXT: lsr x10, x10, #32 +; NONEON-NOSVE-NEXT: sub w9, w9, w10 +; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1 +; NONEON-NOSVE-NEXT: lsr w11, w9, #6 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: umull x10, w9, w8 +; NONEON-NOSVE-NEXT: lsr x10, x10, #32 +; NONEON-NOSVE-NEXT: sub w9, w9, w10 +; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1 +; NONEON-NOSVE-NEXT: lsr w9, w9, #6 +; NONEON-NOSVE-NEXT: stp w9, w11, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #12] +; NONEON-NOSVE-NEXT: umull x10, w9, w8 +; NONEON-NOSVE-NEXT: lsr x10, x10, #32 +; NONEON-NOSVE-NEXT: sub w9, w9, w10 +; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1 +; NONEON-NOSVE-NEXT: lsr w11, w9, #6 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] +; NONEON-NOSVE-NEXT: umull x10, w9, w8 +; NONEON-NOSVE-NEXT: lsr x10, x10, #32 +; NONEON-NOSVE-NEXT: sub w9, w9, w10 +; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1 +; NONEON-NOSVE-NEXT: lsr w9, w9, #6 +; NONEON-NOSVE-NEXT: stp w9, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #4] +; NONEON-NOSVE-NEXT: umull x10, w9, w8 +; NONEON-NOSVE-NEXT: lsr x10, x10, #32 +; NONEON-NOSVE-NEXT: sub 
w9, w9, w10 +; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1 +; NONEON-NOSVE-NEXT: lsr w11, w9, #6 +; NONEON-NOSVE-NEXT: ldr w9, [sp] +; NONEON-NOSVE-NEXT: umull x8, w9, w8 +; NONEON-NOSVE-NEXT: lsr x8, x8, #32 +; NONEON-NOSVE-NEXT: sub w9, w9, w8 +; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #1 +; NONEON-NOSVE-NEXT: lsr w8, w8, #6 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = udiv <8 x i32> %op1, diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll index 9f8511b00c6ed1..2c2b79121ef820 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll @@ -30,18 +30,50 @@ define void @sext_v8i1_v8i32(<8 x i1> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v8i1_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: shl v0.4s, v0.4s, #31 -; NONEON-NOSVE-NEXT: shl v1.4s, v1.4s, #31 -; NONEON-NOSVE-NEXT: cmlt v0.4s, v0.4s, #0 -; NONEON-NOSVE-NEXT: cmlt v1.4s, v1.4s, #0 -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: strh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #34] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #38] +; NONEON-NOSVE-NEXT: sbfx w12, w12, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: stp w10, w8, [sp, #72] +; NONEON-NOSVE-NEXT: sbfx w8, w14, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w15, #0, #1 +; NONEON-NOSVE-NEXT: stp w8, w12, [sp, #64] +; NONEON-NOSVE-NEXT: 
sbfx w12, w13, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w8, w11, #0, #1 +; NONEON-NOSVE-NEXT: stp w12, w10, [sp, #56] +; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %b = sext <8 x i1> %a to <8 x i32> store <8 x i32> %b, ptr %out @@ -73,17 +105,21 @@ define void @sext_v4i3_v4i64(<4 x i3> %a, ptr %out) { ; NONEON-NOSVE-LABEL: sext_v4i3_v4i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: shl v0.2d, v0.2d, #61 -; NONEON-NOSVE-NEXT: shl v1.2d, v1.2d, #61 -; NONEON-NOSVE-NEXT: sshr v0.2d, v0.2d, #61 -; NONEON-NOSVE-NEXT: sshr v1.2d, v1.2d, #61 -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldp w11, w10, [sp, #16] +; NONEON-NOSVE-NEXT: sbfx x8, x8, #0, #3 +; NONEON-NOSVE-NEXT: sbfx x9, x9, #0, #3 +; NONEON-NOSVE-NEXT: sbfx x10, x10, #0, #3 +; NONEON-NOSVE-NEXT: stp x9, x8, [sp, #48] +; NONEON-NOSVE-NEXT: sbfx x8, x11, #0, #3 +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %b = sext <4 x i3> %a to <4 x i64> store <4 x i64> %b, ptr %out @@ -106,13 +142,45 @@ define void @sext_v16i8_v16i16(<16 x i8> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v16i8_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0 -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %b = sext <16 x i8> %a to <16 x i16> store <16 x i16>%b, ptr %out @@ -138,20 +206,206 @@ define void 
@sext_v32i8_v32i16(ptr %in, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v32i8_v32i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b -; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0 -; NONEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: sshll v2.8h, v2.8b, #0 -; NONEON-NOSVE-NEXT: sshll v3.8h, v3.8b, #0 -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: sub sp, sp, #272 +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #176] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #192] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #208] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #224] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #240] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #256] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 272 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w29, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w27, [sp, #16] +; 
NONEON-NOSVE-NEXT: ldrb w25, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w23, [sp, #28] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w21, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w19, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w6, [sp, #38] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: ldrb w28, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #22] +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: add w8, w29, w29 +; NONEON-NOSVE-NEXT: ldrb w4, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: add w8, w27, w27 +; NONEON-NOSVE-NEXT: ldrb w26, [sp, #31] +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: add w8, w25, w25 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: add w8, w23, w23 +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: add w8, w21, w21 +; NONEON-NOSVE-NEXT: ldrb w24, [sp, #29] +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: add w8, w19, w19 +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #23] +; NONEON-NOSVE-NEXT: strb w9, [sp, #59] +; NONEON-NOSVE-NEXT: add w9, w28, w28 +; NONEON-NOSVE-NEXT: add w18, w16, w16 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: add w8, w6, w6 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w22, [sp, #27] +; NONEON-NOSVE-NEXT: strb w9, [sp, #57] +; NONEON-NOSVE-NEXT: add w9, w26, w26 +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: add w8, w4, w4 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w20, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w30, [sp, #21] +; NONEON-NOSVE-NEXT: strb w9, [sp, #55] +; NONEON-NOSVE-NEXT: add w9, w24, w24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: add w8, w2, w2 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #44] +; NONEON-NOSVE-NEXT: add w17, 
w17, w17 +; NONEON-NOSVE-NEXT: strb w9, [sp, #53] +; NONEON-NOSVE-NEXT: add w9, w22, w22 +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: add w8, w16, w16 +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #42] +; NONEON-NOSVE-NEXT: strb w17, [sp, #63] +; NONEON-NOSVE-NEXT: add w17, w30, w30 +; NONEON-NOSVE-NEXT: strb w9, [sp, #51] +; NONEON-NOSVE-NEXT: add w9, w20, w20 +; NONEON-NOSVE-NEXT: ldrb w7, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: ldrb w5, [sp, #37] +; NONEON-NOSVE-NEXT: strb w18, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w3, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w0, [sp, #33] +; NONEON-NOSVE-NEXT: strb w17, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #45] +; NONEON-NOSVE-NEXT: strb w9, [sp, #49] +; NONEON-NOSVE-NEXT: add w9, w7, w7 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #79] +; NONEON-NOSVE-NEXT: add w9, w5, w5 +; NONEON-NOSVE-NEXT: strb w9, [sp, #77] +; NONEON-NOSVE-NEXT: add w9, w3, w3 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #95] +; NONEON-NOSVE-NEXT: strb w9, [sp, #75] +; NONEON-NOSVE-NEXT: add w9, w0, w0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #142] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #94] +; NONEON-NOSVE-NEXT: strb w9, [sp, #73] +; NONEON-NOSVE-NEXT: add w9, w15, w15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #140] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #93] +; NONEON-NOSVE-NEXT: strb w9, [sp, #71] +; NONEON-NOSVE-NEXT: add w9, w13, w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #138] +; 
NONEON-NOSVE-NEXT: ldrsb w8, [sp, #92] +; NONEON-NOSVE-NEXT: strb w9, [sp, #69] +; NONEON-NOSVE-NEXT: add w9, w11, w11 +; NONEON-NOSVE-NEXT: strh w8, [sp, #136] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #91] +; NONEON-NOSVE-NEXT: strb w9, [sp, #67] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #134] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #90] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #256] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #132] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #89] +; NONEON-NOSVE-NEXT: strb w9, [sp, #65] +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #240] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #130] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: strh w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #224] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #126] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #86] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #208] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #124] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #192] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #122] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #176] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #120] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #83] +; NONEON-NOSVE-NEXT: strh w8, [sp, #118] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #82] +; NONEON-NOSVE-NEXT: strh w8, [sp, #116] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #81] +; NONEON-NOSVE-NEXT: strh w8, [sp, #114] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #80] +; NONEON-NOSVE-NEXT: strh w8, [sp, #112] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #111] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #112] +; NONEON-NOSVE-NEXT: strh w8, [sp, 
#174] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #110] +; NONEON-NOSVE-NEXT: strh w8, [sp, #172] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #109] +; NONEON-NOSVE-NEXT: strh w8, [sp, #170] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #108] +; NONEON-NOSVE-NEXT: strh w8, [sp, #168] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #107] +; NONEON-NOSVE-NEXT: strh w8, [sp, #166] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #106] +; NONEON-NOSVE-NEXT: strh w8, [sp, #164] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #105] +; NONEON-NOSVE-NEXT: strh w8, [sp, #162] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #104] +; NONEON-NOSVE-NEXT: strh w8, [sp, #160] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #103] +; NONEON-NOSVE-NEXT: strh w8, [sp, #158] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #102] +; NONEON-NOSVE-NEXT: strh w8, [sp, #156] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #101] +; NONEON-NOSVE-NEXT: strh w8, [sp, #154] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #100] +; NONEON-NOSVE-NEXT: strh w8, [sp, #152] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #99] +; NONEON-NOSVE-NEXT: strh w8, [sp, #150] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #98] +; NONEON-NOSVE-NEXT: strh w8, [sp, #148] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #97] +; NONEON-NOSVE-NEXT: strh w8, [sp, #146] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #96] +; NONEON-NOSVE-NEXT: strh w8, [sp, #144] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #144] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #272 ; NONEON-NOSVE-NEXT: ret %a = load <32 x i8>, ptr %in %b = add <32 x i8> %a, %a @@ -177,14 +431,42 @@ define void @sext_v8i8_v8i32(<8 x i8> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v8i8_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #9] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #13] +; NONEON-NOSVE-NEXT: strh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #44] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #40] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #36] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %b = sext <8 x i8> %a to <8 x i32> store <8 x i32>%b, ptr %out @@ -210,21 +492,75 @@ define void @sext_v16i8_v16i32(<16 x i8> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v16i8_v16i32: ; NONEON-NOSVE: // %bb.0: -; 
NONEON-NOSVE-NEXT: str q0, [sp, #-48]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0 -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: stp q0, q2, [x0] -; NONEON-NOSVE-NEXT: stp q1, q3, [x0, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: str q0, [sp, #-160]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #94] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: 
strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #152] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #90] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #88] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #144] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #86] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #84] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #136] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #82] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #80] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #78] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #128] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #74] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #72] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #70] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #68] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #66] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #64] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %b = sext <16 x i8> %a to <16 x i32> store <16 x i32> %b, ptr %out @@ -263,36 +599,280 @@ define void @sext_v32i8_v32i32(ptr %in, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v32i8_v32i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b -; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0 -; NONEON-NOSVE-NEXT: sshll v2.8h, v2.8b, #0 -; NONEON-NOSVE-NEXT: sshll v3.8h, v3.8b, #0 -; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #64] -; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: ldr d4, [sp, #88] -; NONEON-NOSVE-NEXT: ldr d6, [sp, #40] -; NONEON-NOSVE-NEXT: ldr d7, [sp, #72] -; NONEON-NOSVE-NEXT: sshll v5.4s, v5.4h, #0 -; NONEON-NOSVE-NEXT: sshll v4.4s, v4.4h, #0 -; NONEON-NOSVE-NEXT: stp q0, q5, [x1] -; NONEON-NOSVE-NEXT: sshll v0.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: sshll v2.4s, v6.4h, #0 -; NONEON-NOSVE-NEXT: stp q1, q4, [x1, #64] -; NONEON-NOSVE-NEXT: sshll v1.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: sshll v3.4s, v7.4h, #0 -; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] -; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #96] -; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: sub sp, sp, #464 +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #368] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #384] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #400] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #416] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #432] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #448] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 464 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: 
.cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w29, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w27, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w25, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w23, [sp, #28] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w21, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w19, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w6, [sp, #38] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: ldrb w28, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #22] +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: add w8, w29, w29 +; NONEON-NOSVE-NEXT: ldrb w4, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: add w8, w27, w27 +; NONEON-NOSVE-NEXT: ldrb w26, [sp, #31] +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: add w8, w25, w25 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: add w8, w23, w23 +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: add w8, w21, w21 +; NONEON-NOSVE-NEXT: ldrb w24, [sp, #29] +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: add w8, w19, w19 +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #23] +; NONEON-NOSVE-NEXT: strb w9, [sp, #59] +; NONEON-NOSVE-NEXT: add w9, w28, w28 +; NONEON-NOSVE-NEXT: add w18, w16, w16 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: add w8, w6, w6 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w22, [sp, #27] +; NONEON-NOSVE-NEXT: strb w9, [sp, #57] +; NONEON-NOSVE-NEXT: add w9, w26, w26 +; NONEON-NOSVE-NEXT: strb w8, [sp, 
#78] +; NONEON-NOSVE-NEXT: add w8, w4, w4 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w20, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w30, [sp, #21] +; NONEON-NOSVE-NEXT: strb w9, [sp, #55] +; NONEON-NOSVE-NEXT: add w9, w24, w24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: add w8, w2, w2 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #44] +; NONEON-NOSVE-NEXT: add w17, w17, w17 +; NONEON-NOSVE-NEXT: strb w9, [sp, #53] +; NONEON-NOSVE-NEXT: add w9, w22, w22 +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: add w8, w16, w16 +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #42] +; NONEON-NOSVE-NEXT: strb w17, [sp, #63] +; NONEON-NOSVE-NEXT: add w17, w30, w30 +; NONEON-NOSVE-NEXT: strb w9, [sp, #51] +; NONEON-NOSVE-NEXT: add w9, w20, w20 +; NONEON-NOSVE-NEXT: ldrb w7, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: ldrb w5, [sp, #37] +; NONEON-NOSVE-NEXT: strb w18, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w3, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w0, [sp, #33] +; NONEON-NOSVE-NEXT: strb w17, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #45] +; NONEON-NOSVE-NEXT: strb w9, [sp, #49] +; NONEON-NOSVE-NEXT: add w9, w7, w7 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #79] +; NONEON-NOSVE-NEXT: add w9, w5, w5 +; NONEON-NOSVE-NEXT: strb w9, [sp, #77] +; NONEON-NOSVE-NEXT: add w9, w3, w3 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #91] +; NONEON-NOSVE-NEXT: strb w9, [sp, #75] +; NONEON-NOSVE-NEXT: add 
w9, w0, w0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #134] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #90] +; NONEON-NOSVE-NEXT: strb w9, [sp, #73] +; NONEON-NOSVE-NEXT: add w9, w15, w15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #132] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #89] +; NONEON-NOSVE-NEXT: strb w9, [sp, #71] +; NONEON-NOSVE-NEXT: add w9, w13, w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #130] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #88] +; NONEON-NOSVE-NEXT: strb w9, [sp, #69] +; NONEON-NOSVE-NEXT: add w9, w11, w11 +; NONEON-NOSVE-NEXT: strh w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #95] +; NONEON-NOSVE-NEXT: strb w9, [sp, #67] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #142] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #94] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #448] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #140] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #93] +; NONEON-NOSVE-NEXT: strb w9, [sp, #65] +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #432] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #138] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: strh w8, [sp, #136] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #416] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #118] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #82] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #128] +; NONEON-NOSVE-NEXT: strh w8, [sp, #116] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #400] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #114] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #80] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #192] +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #384] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #112] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #87] +; 
NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #368] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #126] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #86] +; NONEON-NOSVE-NEXT: strh w8, [sp, #124] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #85] +; NONEON-NOSVE-NEXT: strh w8, [sp, #122] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #84] +; NONEON-NOSVE-NEXT: strh w8, [sp, #120] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #107] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #112] +; NONEON-NOSVE-NEXT: strh w8, [sp, #166] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #106] +; NONEON-NOSVE-NEXT: strh w8, [sp, #164] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #105] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #176] +; NONEON-NOSVE-NEXT: strh w8, [sp, #162] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #104] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #182] +; NONEON-NOSVE-NEXT: strh w8, [sp, #160] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #111] +; NONEON-NOSVE-NEXT: strh w8, [sp, #174] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #110] +; NONEON-NOSVE-NEXT: strh w8, [sp, #172] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #109] +; NONEON-NOSVE-NEXT: strh w8, [sp, #170] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #108] +; NONEON-NOSVE-NEXT: strh w8, [sp, #168] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #99] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #160] +; NONEON-NOSVE-NEXT: strh w8, [sp, #150] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #98] +; NONEON-NOSVE-NEXT: strh w8, [sp, #148] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #97] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #224] +; NONEON-NOSVE-NEXT: strh w8, [sp, #146] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #96] +; NONEON-NOSVE-NEXT: strh w8, [sp, #144] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #103] +; NONEON-NOSVE-NEXT: strh w8, [sp, #158] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #102] +; NONEON-NOSVE-NEXT: strh w8, [sp, #156] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #101] +; NONEON-NOSVE-NEXT: strh w8, [sp, #154] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #100] +; NONEON-NOSVE-NEXT: strh w8, [sp, #152] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #198] +; 
NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #144] +; NONEON-NOSVE-NEXT: str w8, [sp, #284] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #196] +; NONEON-NOSVE-NEXT: str w8, [sp, #280] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #194] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #208] +; NONEON-NOSVE-NEXT: str w8, [sp, #276] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #192] +; NONEON-NOSVE-NEXT: str w8, [sp, #272] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #206] +; NONEON-NOSVE-NEXT: str w8, [sp, #300] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #204] +; NONEON-NOSVE-NEXT: str w8, [sp, #296] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #202] +; NONEON-NOSVE-NEXT: str w8, [sp, #292] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #200] +; NONEON-NOSVE-NEXT: str w8, [sp, #288] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #180] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #272] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #248] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #178] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #176] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #240] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #190] +; NONEON-NOSVE-NEXT: str w8, [sp, #268] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #188] +; NONEON-NOSVE-NEXT: str w8, [sp, #264] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #186] +; NONEON-NOSVE-NEXT: str w8, [sp, #260] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #184] +; NONEON-NOSVE-NEXT: str w8, [sp, #256] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #230] +; NONEON-NOSVE-NEXT: ldp q3, q4, [sp, #240] +; NONEON-NOSVE-NEXT: str w8, [sp, #348] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #228] +; NONEON-NOSVE-NEXT: str w8, [sp, #344] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #226] +; NONEON-NOSVE-NEXT: str w8, [sp, #340] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #224] +; NONEON-NOSVE-NEXT: str w8, [sp, #336] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #238] +; NONEON-NOSVE-NEXT: str w8, [sp, #364] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #236] +; NONEON-NOSVE-NEXT: str w8, [sp, #360] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #234] +; NONEON-NOSVE-NEXT: str w8, [sp, #356] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #232] +; 
NONEON-NOSVE-NEXT: str w8, [sp, #352] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #214] +; NONEON-NOSVE-NEXT: ldp q6, q7, [sp, #336] +; NONEON-NOSVE-NEXT: str w8, [sp, #316] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #212] +; NONEON-NOSVE-NEXT: str w8, [sp, #312] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #210] +; NONEON-NOSVE-NEXT: str w8, [sp, #308] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #208] +; NONEON-NOSVE-NEXT: str w8, [sp, #304] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #222] +; NONEON-NOSVE-NEXT: str w8, [sp, #332] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #220] +; NONEON-NOSVE-NEXT: str w8, [sp, #328] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #218] +; NONEON-NOSVE-NEXT: str w8, [sp, #324] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #216] +; NONEON-NOSVE-NEXT: str w8, [sp, #320] +; NONEON-NOSVE-NEXT: ldp q5, q2, [sp, #304] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: stp q3, q4, [x1, #32] +; NONEON-NOSVE-NEXT: stp q6, q7, [x1, #64] +; NONEON-NOSVE-NEXT: stp q5, q2, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #464 ; NONEON-NOSVE-NEXT: ret %a = load <32 x i8>, ptr %in %b = add <32 x i8> %a, %a @@ -325,17 +905,19 @@ define void @sext_v4i8_v4i64(<4 x i8> %a, ptr %out) { ; NONEON-NOSVE-LABEL: sext_v4i8_v4i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: shl v0.2d, v0.2d, #56 -; NONEON-NOSVE-NEXT: shl v1.2d, v1.2d, #56 -; NONEON-NOSVE-NEXT: sshr v0.2d, v0.2d, #56 -; NONEON-NOSVE-NEXT: sshr v1.2d, v1.2d, #56 -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsb x8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsb x9, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsb x10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsb x11, [sp, #20] +; NONEON-NOSVE-NEXT: stp x9, x8, [sp, #48] +; NONEON-NOSVE-NEXT: stp x10, x11, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %b = sext <4 x i8> %a to <4 x i64> store <4 x i64>%b, ptr %out @@ -362,22 +944,57 @@ define void @sext_v8i8_v8i64(<8 x i8> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v8i8_v8i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q2, [x0] -; NONEON-NOSVE-NEXT: stp q1, q3, [x0, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: sub sp, sp, #176 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 176 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: add x8, sp, #144 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strh w9, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w9, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strh w9, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strh w9, [sp, 
#22] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w9, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strh w9, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #40] +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #64] +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #44] +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #72] +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #64] +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #36] +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #56] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #96] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #48] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #80] +; NONEON-NOSVE-NEXT: ldpsw x9, x10, [sp, #96] +; NONEON-NOSVE-NEXT: stp x9, x10, [sp, #144] +; NONEON-NOSVE-NEXT: ldpsw x9, x10, [sp, #104] +; NONEON-NOSVE-NEXT: stp x9, x10, [sp, #160] +; NONEON-NOSVE-NEXT: ldpsw x9, x10, [sp, #80] +; NONEON-NOSVE-NEXT: stp x9, x10, [sp, #112] +; NONEON-NOSVE-NEXT: ldpsw x9, x10, [sp, #88] +; NONEON-NOSVE-NEXT: stp x9, x10, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #112] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x8] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #176 ; NONEON-NOSVE-NEXT: ret %b = sext <8 x i8> %a to <8 x i64> store <8 x i64>%b, ptr %out @@ -419,37 +1036,109 @@ define void @sext_v16i8_v16i64(<16 x i8> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v16i8_v16i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str q0, [sp, #-112]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 112 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0 -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #40] -; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #48] -; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #80] -; NONEON-NOSVE-NEXT: ldr d5, [sp, #72] -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ldr d4, [sp, #104] -; NONEON-NOSVE-NEXT: ldr d6, [sp, #56] -; NONEON-NOSVE-NEXT: ldr d7, [sp, #88] -; NONEON-NOSVE-NEXT: sshll v5.2d, v5.2s, #0 -; NONEON-NOSVE-NEXT: sshll v4.2d, v4.2s, #0 -; NONEON-NOSVE-NEXT: stp q1, q5, [x0, #64] -; NONEON-NOSVE-NEXT: sshll v1.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: sshll v2.2d, v6.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q4, [x0] -; NONEON-NOSVE-NEXT: sshll v0.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: sshll v3.2d, v7.2s, #0 -; NONEON-NOSVE-NEXT: stp q1, q2, [x0, #96] -; NONEON-NOSVE-NEXT: stp q0, q3, [x0, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #112 +; NONEON-NOSVE-NEXT: sub sp, sp, #368 +; NONEON-NOSVE-NEXT: str x29, [sp, #352] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 368 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldr x29, [sp, #352] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #35] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #33] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, 
[sp, #64] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #39] +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #37] +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #56] +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #88] +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #98] +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #96] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #40] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #152] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #102] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #100] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #160] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #90] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #88] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #72] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #152] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #136] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #94] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #92] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #144] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #82] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #80] +; NONEON-NOSVE-NEXT: str d0, [sp, #360] +; NONEON-NOSVE-NEXT: ldp d2, d0, [sp, #136] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, 
#86] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #84] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #74] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #72] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #208] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #120] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #78] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #76] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #192] +; NONEON-NOSVE-NEXT: ldp d2, d0, [sp, #104] +; NONEON-NOSVE-NEXT: str d2, [sp, #168] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #176] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #216] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #320] +; NONEON-NOSVE-NEXT: ldrsw x9, [sp, #364] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #360] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #336] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #200] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #320] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #288] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #208] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #304] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #184] +; NONEON-NOSVE-NEXT: ldp q3, q4, [sp, #288] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #256] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #192] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #272] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #168] +; NONEON-NOSVE-NEXT: ldp q6, q7, [sp, #256] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #224] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #176] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #240] +; NONEON-NOSVE-NEXT: ldp q5, q2, [sp, #224] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q3, q4, [x0, #32] +; NONEON-NOSVE-NEXT: stp q6, q7, [x0, #64] +; NONEON-NOSVE-NEXT: stp q5, q2, [x0, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #368 ; NONEON-NOSVE-NEXT: ret %b = sext <16 x i8> %a to <16 x i64> store <16 x i64> %b, ptr %out @@ -522,69 +1211,367 @@ define void @sext_v32i8_v32i64(ptr %in, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v32i8_v32i64: ; 
NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub sp, sp, #224 -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 224 -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b -; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b -; NONEON-NOSVE-NEXT: stp q0, q1, [sp] -; NONEON-NOSVE-NEXT: sshll v5.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: sshll v6.8h, v1.8b, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v3.8h, v2.8b, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: sshll v4.8h, v2.8b, #0 -; NONEON-NOSVE-NEXT: stp q3, q5, [sp, #32] -; NONEON-NOSVE-NEXT: sshll v5.4s, v5.4h, #0 -; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #56] -; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] -; NONEON-NOSVE-NEXT: stp q4, q6, [sp, #64] -; NONEON-NOSVE-NEXT: sshll v6.4s, v6.4h, #0 -; NONEON-NOSVE-NEXT: sshll v4.4s, v4.4h, #0 -; NONEON-NOSVE-NEXT: ldr d7, [sp, #88] -; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #72] -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: sshll v7.4s, v7.4h, #0 -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: stp q2, q5, [sp, #128] -; NONEON-NOSVE-NEXT: sshll v5.2d, v5.2s, #0 -; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: ldr d19, [sp, #152] -; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #96] -; NONEON-NOSVE-NEXT: ldr d20, [sp, #136] -; NONEON-NOSVE-NEXT: stp q1, q4, [sp, #160] -; NONEON-NOSVE-NEXT: ldr d17, [sp, #104] -; NONEON-NOSVE-NEXT: ldr d21, [sp, #120] -; NONEON-NOSVE-NEXT: stp q7, q6, [sp, #192] -; NONEON-NOSVE-NEXT: sshll v6.2d, v6.2s, #0 -; NONEON-NOSVE-NEXT: sshll v19.2d, v19.2s, #0 -; NONEON-NOSVE-NEXT: ldr d16, [sp, #216] -; NONEON-NOSVE-NEXT: ldr d22, [sp, #200] -; NONEON-NOSVE-NEXT: ldr d23, [sp, #184] -; NONEON-NOSVE-NEXT: ldr d18, [sp, #168] -; NONEON-NOSVE-NEXT: sshll v4.2d, v4.2s, #0 -; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: sshll v16.2d, v16.2s, #0 -; 
NONEON-NOSVE-NEXT: stp q5, q19, [x1] -; NONEON-NOSVE-NEXT: sshll v5.2d, v7.2s, #0 -; NONEON-NOSVE-NEXT: sshll v7.2d, v22.2s, #0 -; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: stp q6, q16, [x1, #128] -; NONEON-NOSVE-NEXT: sshll v6.2d, v23.2s, #0 -; NONEON-NOSVE-NEXT: stp q5, q7, [x1, #160] -; NONEON-NOSVE-NEXT: sshll v5.2d, v20.2s, #0 -; NONEON-NOSVE-NEXT: stp q4, q6, [x1, #192] -; NONEON-NOSVE-NEXT: sshll v4.2d, v21.2s, #0 -; NONEON-NOSVE-NEXT: stp q2, q5, [x1, #32] -; NONEON-NOSVE-NEXT: sshll v2.2d, v17.2s, #0 -; NONEON-NOSVE-NEXT: stp q3, q4, [x1, #64] -; NONEON-NOSVE-NEXT: sshll v3.2d, v18.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #96] -; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #224] -; NONEON-NOSVE-NEXT: add sp, sp, #224 +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #-96]! // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #16] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #32] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: sub sp, sp, #752 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 848 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; 
NONEON-NOSVE-NEXT: ldrb w29, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w27, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w25, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w23, [sp, #28] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w21, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w19, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w6, [sp, #38] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: ldrb w28, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #22] +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: add w8, w29, w29 +; NONEON-NOSVE-NEXT: ldrb w4, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: add w8, w27, w27 +; NONEON-NOSVE-NEXT: ldrb w26, [sp, #31] +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: add w8, w25, w25 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: add w8, w23, w23 +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: add w8, w21, w21 +; NONEON-NOSVE-NEXT: ldrb w24, [sp, #29] +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: add w8, w19, w19 +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #23] +; NONEON-NOSVE-NEXT: strb w9, [sp, #59] +; NONEON-NOSVE-NEXT: add w9, w28, w28 +; NONEON-NOSVE-NEXT: add w18, w16, w16 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: add w8, w6, w6 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w22, [sp, #27] +; NONEON-NOSVE-NEXT: strb w9, [sp, #57] +; NONEON-NOSVE-NEXT: add w9, w26, w26 +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: add w8, w4, w4 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w20, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w30, [sp, #21] +; NONEON-NOSVE-NEXT: strb w9, [sp, #55] +; NONEON-NOSVE-NEXT: add w9, w24, w24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: add 
w8, w2, w2 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #44] +; NONEON-NOSVE-NEXT: add w17, w17, w17 +; NONEON-NOSVE-NEXT: strb w9, [sp, #53] +; NONEON-NOSVE-NEXT: add w9, w22, w22 +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: add w8, w16, w16 +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #42] +; NONEON-NOSVE-NEXT: strb w17, [sp, #63] +; NONEON-NOSVE-NEXT: add w17, w30, w30 +; NONEON-NOSVE-NEXT: strb w9, [sp, #51] +; NONEON-NOSVE-NEXT: add w9, w20, w20 +; NONEON-NOSVE-NEXT: ldrb w7, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: ldrb w5, [sp, #37] +; NONEON-NOSVE-NEXT: strb w18, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w3, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w0, [sp, #33] +; NONEON-NOSVE-NEXT: strb w17, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #45] +; NONEON-NOSVE-NEXT: strb w9, [sp, #49] +; NONEON-NOSVE-NEXT: add w9, w7, w7 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #79] +; NONEON-NOSVE-NEXT: add w9, w5, w5 +; NONEON-NOSVE-NEXT: strb w9, [sp, #77] +; NONEON-NOSVE-NEXT: add w9, w3, w3 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #91] +; NONEON-NOSVE-NEXT: strb w9, [sp, #75] +; NONEON-NOSVE-NEXT: add w9, w0, w0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #134] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #90] +; NONEON-NOSVE-NEXT: strb w9, [sp, #73] +; NONEON-NOSVE-NEXT: add w9, w15, w15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #132] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #89] +; NONEON-NOSVE-NEXT: strb w9, [sp, #71] +; 
NONEON-NOSVE-NEXT: add w9, w13, w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #130] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #88] +; NONEON-NOSVE-NEXT: strb w9, [sp, #69] +; NONEON-NOSVE-NEXT: add w9, w11, w11 +; NONEON-NOSVE-NEXT: strh w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #95] +; NONEON-NOSVE-NEXT: strb w9, [sp, #67] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #142] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #94] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strh w8, [sp, #140] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #93] +; NONEON-NOSVE-NEXT: strb w9, [sp, #65] +; NONEON-NOSVE-NEXT: strh w8, [sp, #138] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: strh w8, [sp, #136] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #83] +; NONEON-NOSVE-NEXT: strh w8, [sp, #118] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #82] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #128] +; NONEON-NOSVE-NEXT: strh w8, [sp, #116] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #81] +; NONEON-NOSVE-NEXT: strh w8, [sp, #114] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #80] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #192] +; NONEON-NOSVE-NEXT: strh w8, [sp, #112] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #87] +; NONEON-NOSVE-NEXT: strh w8, [sp, #126] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #86] +; NONEON-NOSVE-NEXT: strh w8, [sp, #124] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #85] +; NONEON-NOSVE-NEXT: strh w8, [sp, #122] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #84] +; NONEON-NOSVE-NEXT: strh w8, [sp, #120] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #107] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #112] +; NONEON-NOSVE-NEXT: strh w8, [sp, #166] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #106] +; NONEON-NOSVE-NEXT: strh w8, [sp, #164] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #105] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #176] +; NONEON-NOSVE-NEXT: strh w8, [sp, #162] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #104] 
+; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #178] +; NONEON-NOSVE-NEXT: strh w8, [sp, #160] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #111] +; NONEON-NOSVE-NEXT: strh w8, [sp, #174] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #110] +; NONEON-NOSVE-NEXT: strh w8, [sp, #172] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #109] +; NONEON-NOSVE-NEXT: strh w8, [sp, #170] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #108] +; NONEON-NOSVE-NEXT: strh w8, [sp, #168] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #99] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #160] +; NONEON-NOSVE-NEXT: strh w8, [sp, #150] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #98] +; NONEON-NOSVE-NEXT: strh w8, [sp, #148] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #97] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #224] +; NONEON-NOSVE-NEXT: strh w8, [sp, #146] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #96] +; NONEON-NOSVE-NEXT: strh w8, [sp, #144] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #103] +; NONEON-NOSVE-NEXT: strh w8, [sp, #158] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #102] +; NONEON-NOSVE-NEXT: strh w8, [sp, #156] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #101] +; NONEON-NOSVE-NEXT: strh w8, [sp, #154] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #100] +; NONEON-NOSVE-NEXT: strh w8, [sp, #152] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #194] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #144] +; NONEON-NOSVE-NEXT: str w8, [sp, #276] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #192] +; NONEON-NOSVE-NEXT: str w8, [sp, #272] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #198] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #208] +; NONEON-NOSVE-NEXT: str w8, [sp, #284] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #196] +; NONEON-NOSVE-NEXT: str w8, [sp, #280] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #202] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #272] +; NONEON-NOSVE-NEXT: str w8, [sp, #292] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #200] +; NONEON-NOSVE-NEXT: str w8, [sp, #288] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #206] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #400] +; NONEON-NOSVE-NEXT: str w8, [sp, #300] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, 
#204] +; NONEON-NOSVE-NEXT: str w8, [sp, #296] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #176] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #288] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #240] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #182] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #180] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #248] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #186] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #416] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #240] +; NONEON-NOSVE-NEXT: str w8, [sp, #260] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #184] +; NONEON-NOSVE-NEXT: str w8, [sp, #256] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #190] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #368] +; NONEON-NOSVE-NEXT: str w8, [sp, #268] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #188] +; NONEON-NOSVE-NEXT: ldrsw x9, [sp, #372] +; NONEON-NOSVE-NEXT: str w8, [sp, #264] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #226] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #256] +; NONEON-NOSVE-NEXT: str w8, [sp, #340] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #224] +; NONEON-NOSVE-NEXT: str w8, [sp, #336] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #230] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #384] +; NONEON-NOSVE-NEXT: str w8, [sp, #348] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #228] +; NONEON-NOSVE-NEXT: str w8, [sp, #344] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #234] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #336] +; NONEON-NOSVE-NEXT: str w8, [sp, #356] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #232] +; NONEON-NOSVE-NEXT: str w8, [sp, #352] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #238] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #464] +; NONEON-NOSVE-NEXT: str w8, [sp, #364] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #236] +; NONEON-NOSVE-NEXT: str w8, [sp, #360] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #210] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #352] +; NONEON-NOSVE-NEXT: str w8, [sp, #308] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #208] +; NONEON-NOSVE-NEXT: str w8, [sp, #304] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #214] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #480] +; NONEON-NOSVE-NEXT: 
str w8, [sp, #316] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #212] +; NONEON-NOSVE-NEXT: str w8, [sp, #312] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #218] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #304] +; NONEON-NOSVE-NEXT: str w8, [sp, #324] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #216] +; NONEON-NOSVE-NEXT: str w8, [sp, #320] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #222] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #432] +; NONEON-NOSVE-NEXT: str w8, [sp, #332] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #220] +; NONEON-NOSVE-NEXT: str w8, [sp, #328] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #404] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #320] +; NONEON-NOSVE-NEXT: str x8, [sp, #568] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #400] +; NONEON-NOSVE-NEXT: str x8, [sp, #560] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #412] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #448] +; NONEON-NOSVE-NEXT: str x8, [sp, #584] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #408] +; NONEON-NOSVE-NEXT: str x8, [sp, #576] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #420] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #560] +; NONEON-NOSVE-NEXT: str x8, [sp, #600] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #416] +; NONEON-NOSVE-NEXT: str x8, [sp, #592] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #428] +; NONEON-NOSVE-NEXT: str x8, [sp, #616] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #424] +; NONEON-NOSVE-NEXT: str x8, [sp, #608] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #368] +; NONEON-NOSVE-NEXT: ldp q2, q3, [sp, #592] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #496] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #380] +; NONEON-NOSVE-NEXT: str x8, [sp, #520] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #376] +; NONEON-NOSVE-NEXT: str x8, [sp, #512] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #388] +; NONEON-NOSVE-NEXT: ldp q4, q5, [sp, #496] +; NONEON-NOSVE-NEXT: str x8, [sp, #536] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #384] +; NONEON-NOSVE-NEXT: str x8, [sp, #528] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #396] +; NONEON-NOSVE-NEXT: str x8, [sp, #552] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #392] +; NONEON-NOSVE-NEXT: 
str x8, [sp, #544] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #468] +; NONEON-NOSVE-NEXT: ldp q6, q7, [sp, #528] +; NONEON-NOSVE-NEXT: str x8, [sp, #696] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #464] +; NONEON-NOSVE-NEXT: str x8, [sp, #688] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #476] +; NONEON-NOSVE-NEXT: str x8, [sp, #712] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #472] +; NONEON-NOSVE-NEXT: str x8, [sp, #704] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #484] +; NONEON-NOSVE-NEXT: ldp q16, q17, [sp, #688] +; NONEON-NOSVE-NEXT: str x8, [sp, #728] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #480] +; NONEON-NOSVE-NEXT: str x8, [sp, #720] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #492] +; NONEON-NOSVE-NEXT: str x8, [sp, #744] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #488] +; NONEON-NOSVE-NEXT: str x8, [sp, #736] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #436] +; NONEON-NOSVE-NEXT: ldp q19, q20, [sp, #720] +; NONEON-NOSVE-NEXT: str x8, [sp, #632] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #432] +; NONEON-NOSVE-NEXT: str x8, [sp, #624] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #444] +; NONEON-NOSVE-NEXT: str x8, [sp, #648] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #440] +; NONEON-NOSVE-NEXT: str x8, [sp, #640] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #452] +; NONEON-NOSVE-NEXT: ldp q22, q23, [sp, #624] +; NONEON-NOSVE-NEXT: str x8, [sp, #664] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #448] +; NONEON-NOSVE-NEXT: str x8, [sp, #656] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #460] +; NONEON-NOSVE-NEXT: str x8, [sp, #680] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #456] +; NONEON-NOSVE-NEXT: str x8, [sp, #672] +; NONEON-NOSVE-NEXT: ldp q21, q18, [sp, #656] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1, #32] +; NONEON-NOSVE-NEXT: stp q4, q5, [x1, #64] +; NONEON-NOSVE-NEXT: stp q6, q7, [x1, #96] +; NONEON-NOSVE-NEXT: stp q16, q17, [x1, #128] +; NONEON-NOSVE-NEXT: stp q19, q20, [x1, #160] +; NONEON-NOSVE-NEXT: stp q22, q23, [x1, #192] +; NONEON-NOSVE-NEXT: stp q21, q18, [x1, #224] +; NONEON-NOSVE-NEXT: add sp, sp, #752 +; 
NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #32] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #16] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp], #96 // 16-byte Folded Reload ; NONEON-NOSVE-NEXT: ret %a = load <32 x i8>, ptr %in %b = add <32 x i8> %a, %a @@ -609,13 +1596,25 @@ define void @sext_v8i16_v8i32(<8 x i16> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v8i16_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %b = sext <8 x i16> %a to <8 x i32> store <8 x i32>%b, ptr %out @@ -640,20 +1639,91 @@ define void @sext_v16i16_v16i32(ptr %in, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v16i16_v16i32: ; NONEON-NOSVE: // %bb.0: -; 
NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h -; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: sub sp, sp, #160 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w3, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w5, [sp] +; NONEON-NOSVE-NEXT: ldrh w2, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w4, [sp, #14] +; NONEON-NOSVE-NEXT: add w13, w13, w13 +; NONEON-NOSVE-NEXT: add w14, w14, w14 +; NONEON-NOSVE-NEXT: ldrh w18, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w0, [sp, #10] +; NONEON-NOSVE-NEXT: strh w14, [sp, #46] +; NONEON-NOSVE-NEXT: add w14, w3, w3 +; NONEON-NOSVE-NEXT: strh w13, [sp, #44] +; NONEON-NOSVE-NEXT: add w13, w5, w5 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: strh w14, [sp, #42] +; NONEON-NOSVE-NEXT: add w14, w4, w4 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #26] +; NONEON-NOSVE-NEXT: strh w13, [sp, #40] +; NONEON-NOSVE-NEXT: add w13, w2, w2 +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #22] +; NONEON-NOSVE-NEXT: strh w14, [sp, #38] +; NONEON-NOSVE-NEXT: add w14, w0, w0 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strh w13, [sp, #36] +; NONEON-NOSVE-NEXT: add w13, w18, w18 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w14, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #30] +; NONEON-NOSVE-NEXT: strh 
w13, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #18] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #20] +; NONEON-NOSVE-NEXT: strh w9, [sp, #50] +; NONEON-NOSVE-NEXT: add w14, w17, w17 +; NONEON-NOSVE-NEXT: add w12, w12, w12 +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: add w13, w16, w16 +; NONEON-NOSVE-NEXT: add w11, w11, w11 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: add w10, w10, w10 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #78] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #76] +; NONEON-NOSVE-NEXT: strh w14, [sp, #62] +; NONEON-NOSVE-NEXT: add w14, w15, w15 +; NONEON-NOSVE-NEXT: strh w13, [sp, #60] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #74] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #72] +; NONEON-NOSVE-NEXT: strh w14, [sp, #58] +; NONEON-NOSVE-NEXT: strh w12, [sp, #56] +; NONEON-NOSVE-NEXT: strh w11, [sp, #54] +; NONEON-NOSVE-NEXT: strh w10, [sp, #52] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #70] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #68] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #66] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #64] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #96] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #94] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #96] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #152] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #90] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #88] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #144] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #86] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #84] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #136] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #82] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #80] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q0, q1, 
[sp, #128] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %a = load <16 x i16>, ptr %in %b = add <16 x i16> %a, %a @@ -679,14 +1749,24 @@ define void @sext_v4i16_v4i64(<4 x i16> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v4i16_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #12] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #40] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #64] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %b = sext <4 x i16> %a to <4 x i64> store <4 x i64>%b, ptr %out @@ -712,21 +1792,39 @@ define void @sext_v8i16_v8i64(<8 x i16> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v8i16_v8i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q2, [x0] -; NONEON-NOSVE-NEXT: stp q1, q3, [x0, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: str q0, [sp, #-160]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #88] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #144] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #80] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #128] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #72] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #128] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #112] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #64] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0, #32] +; 
NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %b = sext <8 x i16> %a to <8 x i64> store <8 x i64>%b, ptr %out @@ -765,36 +1863,124 @@ define void @sext_v16i16_v16i64(ptr %in, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v16i16_v16i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h -; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #64] -; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] -; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ldr d4, [sp, #88] -; NONEON-NOSVE-NEXT: ldr d6, [sp, #40] -; NONEON-NOSVE-NEXT: ldr d7, [sp, #72] -; NONEON-NOSVE-NEXT: sshll v5.2d, v5.2s, #0 -; NONEON-NOSVE-NEXT: sshll v4.2d, v4.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q5, [x1] -; NONEON-NOSVE-NEXT: sshll v0.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: sshll v2.2d, v6.2s, #0 -; NONEON-NOSVE-NEXT: stp q1, q4, [x1, #64] -; NONEON-NOSVE-NEXT: sshll v1.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: sshll v3.2d, v7.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] -; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #96] -; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: sub sp, sp, #368 +; NONEON-NOSVE-NEXT: str x29, [sp, #352] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 368 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldr x29, [sp, #352] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: stp q1, q0, [sp] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #6] +; 
NONEON-NOSVE-NEXT: ldrh w3, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w5, [sp] +; NONEON-NOSVE-NEXT: ldrh w2, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w4, [sp, #14] +; NONEON-NOSVE-NEXT: add w13, w13, w13 +; NONEON-NOSVE-NEXT: add w14, w14, w14 +; NONEON-NOSVE-NEXT: ldrh w18, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w0, [sp, #10] +; NONEON-NOSVE-NEXT: strh w14, [sp, #54] +; NONEON-NOSVE-NEXT: add w14, w3, w3 +; NONEON-NOSVE-NEXT: strh w13, [sp, #52] +; NONEON-NOSVE-NEXT: add w13, w5, w5 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: strh w14, [sp, #50] +; NONEON-NOSVE-NEXT: add w14, w4, w4 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #26] +; NONEON-NOSVE-NEXT: strh w13, [sp, #48] +; NONEON-NOSVE-NEXT: add w13, w2, w2 +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #22] +; NONEON-NOSVE-NEXT: strh w14, [sp, #46] +; NONEON-NOSVE-NEXT: add w14, w0, w0 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strh w13, [sp, #44] +; NONEON-NOSVE-NEXT: add w13, w18, w18 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w14, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #30] +; NONEON-NOSVE-NEXT: strh w13, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #18] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #20] +; NONEON-NOSVE-NEXT: strh w9, [sp, #58] +; NONEON-NOSVE-NEXT: add w14, w17, w17 +; NONEON-NOSVE-NEXT: add w12, w12, w12 +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: add w13, w16, w16 +; NONEON-NOSVE-NEXT: add w11, w11, w11 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #72] +; NONEON-NOSVE-NEXT: add w10, w10, w10 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #82] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #80] +; NONEON-NOSVE-NEXT: strh w14, [sp, #70] +; NONEON-NOSVE-NEXT: add w14, w15, w15 +; NONEON-NOSVE-NEXT: strh w13, [sp, #68] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #86] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #84] 
+; NONEON-NOSVE-NEXT: strh w14, [sp, #66] +; NONEON-NOSVE-NEXT: strh w12, [sp, #64] +; NONEON-NOSVE-NEXT: strh w11, [sp, #62] +; NONEON-NOSVE-NEXT: strh w10, [sp, #60] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #56] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #74] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #72] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #78] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #76] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #88] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #120] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #98] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #96] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #152] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #102] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #100] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #184] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #104] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #160] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #90] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #88] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #136] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #94] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #92] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #168] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #152] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #144] +; NONEON-NOSVE-NEXT: str d0, [sp, #360] +; NONEON-NOSVE-NEXT: ldp d2, d0, [sp, #136] +; NONEON-NOSVE-NEXT: str d2, [sp, #200] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #208] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #184] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #256] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #192] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #272] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #168] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #256] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #224] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #176] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #240] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #216] +; NONEON-NOSVE-NEXT: ldp q3, q4, [sp, #224] +; 
NONEON-NOSVE-NEXT: stp x8, x9, [sp, #320] +; NONEON-NOSVE-NEXT: ldrsw x9, [sp, #364] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #360] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #336] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #200] +; NONEON-NOSVE-NEXT: ldp q6, q7, [sp, #320] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #288] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #208] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #304] +; NONEON-NOSVE-NEXT: ldp q5, q2, [sp, #288] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: stp q3, q4, [x1, #32] +; NONEON-NOSVE-NEXT: stp q6, q7, [x1, #64] +; NONEON-NOSVE-NEXT: stp q5, q2, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #368 ; NONEON-NOSVE-NEXT: ret %a = load <16 x i16>, ptr %in %b = add <16 x i16> %a, %a @@ -819,13 +2005,17 @@ define void @sext_v4i32_v4i64(<4 x i32> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v4i32_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #24] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %b = sext <4 x i32> %a to <4 x i64> store <4 x i64>%b, ptr %out @@ -850,20 +2040,43 @@ define void @sext_v8i32_v8i64(ptr %in, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v8i32_v8i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add v0.4s, v0.4s, v0.4s -; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v1.4s -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: sub sp, sp, #160 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: ldp w12, w13, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w14, w15, [sp, #16] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: ldp w10, w11, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: add w9, w13, w13 +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: add w9, w15, w15 +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, 
[sp, #56] +; NONEON-NOSVE-NEXT: add w9, w11, w11 +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #72] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #112] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #64] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #96] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #88] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #96] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #144] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #80] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #128] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %a = load <8 x i32>, ptr %in %b = add <8 x i32> %a, %a @@ -888,13 +2101,45 @@ define void @zext_v16i8_v16i16(<16 x i8> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v16i8_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: ushll v1.8h, v1.8b, #0 -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %b = zext <16 x i8> %a to <16 x i16> store <16 x i16>%b, ptr %out @@ -920,20 +2165,206 @@ define void @zext_v32i8_v32i16(ptr %in, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v32i8_v32i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b -; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v1.8h, v1.8b, #0 -; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: ushll v2.8h, v2.8b, #0 -; NONEON-NOSVE-NEXT: ushll v3.8h, v3.8b, #0 -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: sub sp, sp, #272 +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #176] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #192] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #208] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #224] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #240] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #256] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 272 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w29, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w27, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w25, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w23, [sp, #28] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w21, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w19, [sp, #24] +; NONEON-NOSVE-NEXT: 
ldrb w9, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w6, [sp, #38] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: ldrb w28, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #22] +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: add w8, w29, w29 +; NONEON-NOSVE-NEXT: ldrb w4, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: add w8, w27, w27 +; NONEON-NOSVE-NEXT: ldrb w26, [sp, #31] +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: add w8, w25, w25 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: add w8, w23, w23 +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: add w8, w21, w21 +; NONEON-NOSVE-NEXT: ldrb w24, [sp, #29] +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: add w8, w19, w19 +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #23] +; NONEON-NOSVE-NEXT: strb w9, [sp, #59] +; NONEON-NOSVE-NEXT: add w9, w28, w28 +; NONEON-NOSVE-NEXT: add w18, w16, w16 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: add w8, w6, w6 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w22, [sp, #27] +; NONEON-NOSVE-NEXT: strb w9, [sp, #57] +; NONEON-NOSVE-NEXT: add w9, w26, w26 +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: add w8, w4, w4 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w20, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w30, [sp, #21] +; NONEON-NOSVE-NEXT: strb w9, [sp, #55] +; NONEON-NOSVE-NEXT: add w9, w24, w24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: add w8, w2, w2 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #44] +; NONEON-NOSVE-NEXT: add w17, w17, w17 +; NONEON-NOSVE-NEXT: strb w9, [sp, #53] +; NONEON-NOSVE-NEXT: add w9, w22, w22 +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: add w8, w16, w16 +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #42] +; NONEON-NOSVE-NEXT: strb w17, [sp, #63] +; NONEON-NOSVE-NEXT: add w17, w30, w30 +; 
NONEON-NOSVE-NEXT: strb w9, [sp, #51] +; NONEON-NOSVE-NEXT: add w9, w20, w20 +; NONEON-NOSVE-NEXT: ldrb w7, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: ldrb w5, [sp, #37] +; NONEON-NOSVE-NEXT: strb w18, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w3, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w0, [sp, #33] +; NONEON-NOSVE-NEXT: strb w17, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #45] +; NONEON-NOSVE-NEXT: strb w9, [sp, #49] +; NONEON-NOSVE-NEXT: add w9, w7, w7 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #79] +; NONEON-NOSVE-NEXT: add w9, w5, w5 +; NONEON-NOSVE-NEXT: strb w9, [sp, #77] +; NONEON-NOSVE-NEXT: add w9, w3, w3 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #95] +; NONEON-NOSVE-NEXT: strb w9, [sp, #75] +; NONEON-NOSVE-NEXT: add w9, w0, w0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #142] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #94] +; NONEON-NOSVE-NEXT: strb w9, [sp, #73] +; NONEON-NOSVE-NEXT: add w9, w15, w15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #140] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #93] +; NONEON-NOSVE-NEXT: strb w9, [sp, #71] +; NONEON-NOSVE-NEXT: add w9, w13, w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #138] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #92] +; NONEON-NOSVE-NEXT: strb w9, [sp, #69] +; NONEON-NOSVE-NEXT: add w9, w11, w11 +; NONEON-NOSVE-NEXT: strh w8, [sp, #136] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #91] +; NONEON-NOSVE-NEXT: strb w9, [sp, #67] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] // 4-byte Folded Reload +; 
NONEON-NOSVE-NEXT: strh w8, [sp, #134] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #90] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #256] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #132] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #89] +; NONEON-NOSVE-NEXT: strb w9, [sp, #65] +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #240] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #130] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: strh w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #224] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #126] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #86] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #208] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #124] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #192] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #122] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #176] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #120] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #83] +; NONEON-NOSVE-NEXT: strh w8, [sp, #118] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #82] +; NONEON-NOSVE-NEXT: strh w8, [sp, #116] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #81] +; NONEON-NOSVE-NEXT: strh w8, [sp, #114] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #80] +; NONEON-NOSVE-NEXT: strh w8, [sp, #112] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #111] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #112] +; NONEON-NOSVE-NEXT: strh w8, [sp, #174] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #110] +; NONEON-NOSVE-NEXT: strh w8, [sp, #172] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #109] +; NONEON-NOSVE-NEXT: strh w8, [sp, #170] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #108] +; NONEON-NOSVE-NEXT: strh w8, [sp, #168] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #107] +; NONEON-NOSVE-NEXT: strh 
w8, [sp, #166] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #106] +; NONEON-NOSVE-NEXT: strh w8, [sp, #164] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #105] +; NONEON-NOSVE-NEXT: strh w8, [sp, #162] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #104] +; NONEON-NOSVE-NEXT: strh w8, [sp, #160] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #103] +; NONEON-NOSVE-NEXT: strh w8, [sp, #158] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #102] +; NONEON-NOSVE-NEXT: strh w8, [sp, #156] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #101] +; NONEON-NOSVE-NEXT: strh w8, [sp, #154] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #100] +; NONEON-NOSVE-NEXT: strh w8, [sp, #152] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #99] +; NONEON-NOSVE-NEXT: strh w8, [sp, #150] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #98] +; NONEON-NOSVE-NEXT: strh w8, [sp, #148] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #97] +; NONEON-NOSVE-NEXT: strh w8, [sp, #146] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #96] +; NONEON-NOSVE-NEXT: strh w8, [sp, #144] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #144] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #272 ; NONEON-NOSVE-NEXT: ret %a = load <32 x i8>, ptr %in %b = add <32 x i8> %a, %a @@ -959,14 +2390,42 @@ define void @zext_v8i8_v8i32(<8 x i8> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v8i8_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: strh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #44] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #40] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #36] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %b = zext <8 x i8> %a to <8 x i32> store <8 x i32>%b, ptr %out @@ -992,21 +2451,75 @@ define void @zext_v16i8_v16i32(<16 x i8> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v16i8_v16i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: 
str q0, [sp, #-48]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: ushll v1.8h, v1.8b, #0 -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: stp q0, q2, [x0] -; NONEON-NOSVE-NEXT: stp q1, q3, [x0, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: str q0, [sp, #-160]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #94] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; 
NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #152] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #88] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #144] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #84] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #136] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #80] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #128] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #72] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #68] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #64] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %b = zext <16 x i8> %a to <16 x i32> store <16 x i32> %b, ptr %out @@ -1045,36 +2558,280 @@ define void @zext_v32i8_v32i32(ptr %in, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v32i8_v32i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b -; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: ushll v1.8h, v1.8b, #0 -; NONEON-NOSVE-NEXT: ushll v2.8h, v2.8b, #0 -; NONEON-NOSVE-NEXT: ushll v3.8h, v3.8b, #0 -; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #64] -; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: ldr d4, [sp, #88] -; NONEON-NOSVE-NEXT: ldr d6, [sp, #40] -; NONEON-NOSVE-NEXT: ldr d7, [sp, #72] -; NONEON-NOSVE-NEXT: ushll v5.4s, v5.4h, #0 -; NONEON-NOSVE-NEXT: ushll v4.4s, v4.4h, #0 -; NONEON-NOSVE-NEXT: stp q0, q5, [x1] -; NONEON-NOSVE-NEXT: ushll v0.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: ushll v2.4s, v6.4h, #0 -; NONEON-NOSVE-NEXT: stp q1, q4, [x1, #64] -; NONEON-NOSVE-NEXT: ushll v1.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: ushll v3.4s, v7.4h, #0 -; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] -; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #96] -; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: sub sp, sp, #464 +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #368] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #384] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #400] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #416] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #432] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #448] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 464 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: 
.cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w29, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w27, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w25, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w23, [sp, #28] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w21, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w19, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w6, [sp, #38] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: ldrb w28, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #22] +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: add w8, w29, w29 +; NONEON-NOSVE-NEXT: ldrb w4, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: add w8, w27, w27 +; NONEON-NOSVE-NEXT: ldrb w26, [sp, #31] +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: add w8, w25, w25 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: add w8, w23, w23 +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: add w8, w21, w21 +; NONEON-NOSVE-NEXT: ldrb w24, [sp, #29] +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: add w8, w19, w19 +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #23] +; NONEON-NOSVE-NEXT: strb w9, [sp, #59] +; NONEON-NOSVE-NEXT: add w9, w28, w28 +; NONEON-NOSVE-NEXT: add w18, w16, w16 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: add w8, w6, w6 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w22, [sp, #27] +; NONEON-NOSVE-NEXT: strb w9, [sp, #57] +; NONEON-NOSVE-NEXT: add w9, w26, w26 +; NONEON-NOSVE-NEXT: strb w8, [sp, 
#78] +; NONEON-NOSVE-NEXT: add w8, w4, w4 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w20, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w30, [sp, #21] +; NONEON-NOSVE-NEXT: strb w9, [sp, #55] +; NONEON-NOSVE-NEXT: add w9, w24, w24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: add w8, w2, w2 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #44] +; NONEON-NOSVE-NEXT: add w17, w17, w17 +; NONEON-NOSVE-NEXT: strb w9, [sp, #53] +; NONEON-NOSVE-NEXT: add w9, w22, w22 +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: add w8, w16, w16 +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #42] +; NONEON-NOSVE-NEXT: strb w17, [sp, #63] +; NONEON-NOSVE-NEXT: add w17, w30, w30 +; NONEON-NOSVE-NEXT: strb w9, [sp, #51] +; NONEON-NOSVE-NEXT: add w9, w20, w20 +; NONEON-NOSVE-NEXT: ldrb w7, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: ldrb w5, [sp, #37] +; NONEON-NOSVE-NEXT: strb w18, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w3, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w0, [sp, #33] +; NONEON-NOSVE-NEXT: strb w17, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #45] +; NONEON-NOSVE-NEXT: strb w9, [sp, #49] +; NONEON-NOSVE-NEXT: add w9, w7, w7 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #79] +; NONEON-NOSVE-NEXT: add w9, w5, w5 +; NONEON-NOSVE-NEXT: strb w9, [sp, #77] +; NONEON-NOSVE-NEXT: add w9, w3, w3 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #91] +; NONEON-NOSVE-NEXT: strb w9, [sp, #75] +; NONEON-NOSVE-NEXT: add 
w9, w0, w0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #134] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #90] +; NONEON-NOSVE-NEXT: strb w9, [sp, #73] +; NONEON-NOSVE-NEXT: add w9, w15, w15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #132] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #89] +; NONEON-NOSVE-NEXT: strb w9, [sp, #71] +; NONEON-NOSVE-NEXT: add w9, w13, w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #130] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #88] +; NONEON-NOSVE-NEXT: strb w9, [sp, #69] +; NONEON-NOSVE-NEXT: add w9, w11, w11 +; NONEON-NOSVE-NEXT: strh w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #95] +; NONEON-NOSVE-NEXT: strb w9, [sp, #67] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #142] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #94] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #448] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #140] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #93] +; NONEON-NOSVE-NEXT: strb w9, [sp, #65] +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #432] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #138] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: strh w8, [sp, #136] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #416] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #118] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #82] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #128] +; NONEON-NOSVE-NEXT: strh w8, [sp, #116] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #400] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #114] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #80] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #192] +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #384] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #112] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldp x29, 
x30, [sp, #368] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #126] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #86] +; NONEON-NOSVE-NEXT: strh w8, [sp, #124] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #85] +; NONEON-NOSVE-NEXT: strh w8, [sp, #122] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #84] +; NONEON-NOSVE-NEXT: strh w8, [sp, #120] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #107] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #112] +; NONEON-NOSVE-NEXT: strh w8, [sp, #166] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #106] +; NONEON-NOSVE-NEXT: strh w8, [sp, #164] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #105] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #176] +; NONEON-NOSVE-NEXT: strh w8, [sp, #162] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #104] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #182] +; NONEON-NOSVE-NEXT: strh w8, [sp, #160] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #111] +; NONEON-NOSVE-NEXT: strh w8, [sp, #174] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #110] +; NONEON-NOSVE-NEXT: strh w8, [sp, #172] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #109] +; NONEON-NOSVE-NEXT: strh w8, [sp, #170] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #108] +; NONEON-NOSVE-NEXT: strh w8, [sp, #168] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #99] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #160] +; NONEON-NOSVE-NEXT: strh w8, [sp, #150] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #98] +; NONEON-NOSVE-NEXT: strh w8, [sp, #148] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #97] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #224] +; NONEON-NOSVE-NEXT: strh w8, [sp, #146] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #96] +; NONEON-NOSVE-NEXT: strh w8, [sp, #144] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #103] +; NONEON-NOSVE-NEXT: strh w8, [sp, #158] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #102] +; NONEON-NOSVE-NEXT: strh w8, [sp, #156] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #101] +; NONEON-NOSVE-NEXT: strh w8, [sp, #154] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #100] +; NONEON-NOSVE-NEXT: strh w8, [sp, #152] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #198] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #144] +; 
NONEON-NOSVE-NEXT: str w8, [sp, #284] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #196] +; NONEON-NOSVE-NEXT: str w8, [sp, #280] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #194] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #208] +; NONEON-NOSVE-NEXT: str w8, [sp, #276] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #192] +; NONEON-NOSVE-NEXT: str w8, [sp, #272] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #206] +; NONEON-NOSVE-NEXT: str w8, [sp, #300] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #204] +; NONEON-NOSVE-NEXT: str w8, [sp, #296] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #202] +; NONEON-NOSVE-NEXT: str w8, [sp, #292] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #200] +; NONEON-NOSVE-NEXT: str w8, [sp, #288] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #180] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #272] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #248] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #178] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #176] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #240] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #190] +; NONEON-NOSVE-NEXT: str w8, [sp, #268] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #188] +; NONEON-NOSVE-NEXT: str w8, [sp, #264] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #186] +; NONEON-NOSVE-NEXT: str w8, [sp, #260] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #184] +; NONEON-NOSVE-NEXT: str w8, [sp, #256] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #230] +; NONEON-NOSVE-NEXT: ldp q3, q4, [sp, #240] +; NONEON-NOSVE-NEXT: str w8, [sp, #348] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #228] +; NONEON-NOSVE-NEXT: str w8, [sp, #344] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #226] +; NONEON-NOSVE-NEXT: str w8, [sp, #340] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #224] +; NONEON-NOSVE-NEXT: str w8, [sp, #336] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #238] +; NONEON-NOSVE-NEXT: str w8, [sp, #364] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #236] +; NONEON-NOSVE-NEXT: str w8, [sp, #360] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #234] +; NONEON-NOSVE-NEXT: str w8, [sp, #356] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #232] +; NONEON-NOSVE-NEXT: str w8, [sp, #352] +; NONEON-NOSVE-NEXT: ldrb w8, 
[sp, #214] +; NONEON-NOSVE-NEXT: ldp q6, q7, [sp, #336] +; NONEON-NOSVE-NEXT: str w8, [sp, #316] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #212] +; NONEON-NOSVE-NEXT: str w8, [sp, #312] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #210] +; NONEON-NOSVE-NEXT: str w8, [sp, #308] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #208] +; NONEON-NOSVE-NEXT: str w8, [sp, #304] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #222] +; NONEON-NOSVE-NEXT: str w8, [sp, #332] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #220] +; NONEON-NOSVE-NEXT: str w8, [sp, #328] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #218] +; NONEON-NOSVE-NEXT: str w8, [sp, #324] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #216] +; NONEON-NOSVE-NEXT: str w8, [sp, #320] +; NONEON-NOSVE-NEXT: ldp q5, q2, [sp, #304] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: stp q3, q4, [x1, #32] +; NONEON-NOSVE-NEXT: stp q6, q7, [x1, #64] +; NONEON-NOSVE-NEXT: stp q5, q2, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #464 ; NONEON-NOSVE-NEXT: ret %a = load <32 x i8>, ptr %in %b = add <32 x i8> %a, %a @@ -1104,16 +2861,26 @@ define void @zext_v4i8_v4i64(<4 x i8> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v4i8_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d1, #0xff00ff00ff00ff -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #72] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #64] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #56] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #48] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %b = zext <4 x i8> %a to <4 x i64> store <4 x i64>%b, ptr %out @@ -1140,22 +2907,61 @@ define void @zext_v8i8_v8i64(<8 x i8> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v8i8_v8i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: ushll v3.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q2, [x0] -; NONEON-NOSVE-NEXT: stp q1, q3, [x0, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: sub sp, sp, #176 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 176 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: add x8, sp, #144 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strh w9, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w9, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strh w9, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strh w9, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w9, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strh w9, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #64] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #64] +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #38] +; NONEON-NOSVE-NEXT: 
ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #96] +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #56] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #96] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #48] +; NONEON-NOSVE-NEXT: stp w10, wzr, [sp, #152] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #144] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #104] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #80] +; NONEON-NOSVE-NEXT: stp w10, wzr, [sp, #168] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #160] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #80] +; NONEON-NOSVE-NEXT: stp w10, wzr, [sp, #120] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #112] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #88] +; NONEON-NOSVE-NEXT: stp w10, wzr, [sp, #136] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #112] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x8] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #176 ; NONEON-NOSVE-NEXT: ret %b = zext <8 x i8> %a to <8 x i64> store <8 x i64>%b, ptr %out @@ -1197,37 +3003,129 @@ define void @zext_v16i8_v16i64(<16 x i8> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v16i8_v16i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str q0, [sp, #-112]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 112 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: ushll v1.8h, v1.8b, #0 -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #40] -; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #48] -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #80] -; NONEON-NOSVE-NEXT: ldr d5, [sp, #72] -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ldr d4, [sp, #104] -; NONEON-NOSVE-NEXT: ldr d6, [sp, #56] -; NONEON-NOSVE-NEXT: ldr d7, [sp, #88] -; NONEON-NOSVE-NEXT: ushll v5.2d, v5.2s, #0 -; NONEON-NOSVE-NEXT: ushll v4.2d, v4.2s, #0 -; NONEON-NOSVE-NEXT: stp q1, q5, [x0, #64] -; NONEON-NOSVE-NEXT: ushll v1.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: ushll v2.2d, v6.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q4, [x0] -; NONEON-NOSVE-NEXT: ushll v0.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: ushll v3.2d, v7.2s, #0 -; NONEON-NOSVE-NEXT: stp q1, q2, [x0, #96] -; NONEON-NOSVE-NEXT: stp q0, q3, [x0, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #112 +; NONEON-NOSVE-NEXT: sub sp, sp, #368 +; NONEON-NOSVE-NEXT: str x29, [sp, #352] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 368 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldr x29, [sp, #352] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: str wzr, [sp, #332] +; NONEON-NOSVE-NEXT: str wzr, [sp, #324] +; NONEON-NOSVE-NEXT: str wzr, [sp, #348] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #35] +; NONEON-NOSVE-NEXT: str wzr, [sp, #340] +; NONEON-NOSVE-NEXT: str wzr, [sp, #300] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, 
[sp, #34] +; NONEON-NOSVE-NEXT: str wzr, [sp, #292] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #33] +; NONEON-NOSVE-NEXT: str wzr, [sp, #316] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #32] +; NONEON-NOSVE-NEXT: str wzr, [sp, #308] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #39] +; NONEON-NOSVE-NEXT: str wzr, [sp, #268] +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #38] +; NONEON-NOSVE-NEXT: str wzr, [sp, #260] +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #37] +; NONEON-NOSVE-NEXT: str wzr, [sp, #284] +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #36] +; NONEON-NOSVE-NEXT: str wzr, [sp, #276] +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #56] +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #88] +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #98] +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #96] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #40] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #152] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #102] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #100] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #160] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #88] +; NONEON-NOSVE-NEXT: stp d0, d1, 
[sp, #72] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #152] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #136] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #92] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #144] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #80] +; NONEON-NOSVE-NEXT: str d0, [sp, #360] +; NONEON-NOSVE-NEXT: ldp d2, d0, [sp, #136] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #84] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #72] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #208] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #120] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #76] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #216] +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #192] +; NONEON-NOSVE-NEXT: ldp d2, d0, [sp, #104] +; NONEON-NOSVE-NEXT: str w8, [sp, #320] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #364] +; NONEON-NOSVE-NEXT: str w9, [sp, #328] +; NONEON-NOSVE-NEXT: str w8, [sp, #344] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #360] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #176] +; NONEON-NOSVE-NEXT: str w8, [sp, #336] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #200] +; NONEON-NOSVE-NEXT: str d2, [sp, #168] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #320] +; NONEON-NOSVE-NEXT: str w9, [sp, #296] +; NONEON-NOSVE-NEXT: str w8, [sp, #288] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #208] +; NONEON-NOSVE-NEXT: str w9, [sp, #312] +; NONEON-NOSVE-NEXT: str w8, [sp, #304] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #184] +; NONEON-NOSVE-NEXT: ldp q3, q4, [sp, #288] +; NONEON-NOSVE-NEXT: str w9, [sp, #264] +; NONEON-NOSVE-NEXT: stp wzr, w8, [sp, #252] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #192] +; NONEON-NOSVE-NEXT: str w9, [sp, #280] +; NONEON-NOSVE-NEXT: str w8, [sp, #272] +; 
NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #168] +; NONEON-NOSVE-NEXT: ldp q6, q7, [sp, #256] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #232] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #224] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #176] +; NONEON-NOSVE-NEXT: stp wzr, w9, [sp, #244] +; NONEON-NOSVE-NEXT: str w8, [sp, #240] +; NONEON-NOSVE-NEXT: ldp q5, q2, [sp, #224] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q3, q4, [x0, #32] +; NONEON-NOSVE-NEXT: stp q6, q7, [x0, #64] +; NONEON-NOSVE-NEXT: stp q5, q2, [x0, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #368 ; NONEON-NOSVE-NEXT: ret %b = zext <16 x i8> %a to <16 x i64> store <16 x i64> %b, ptr %out @@ -1300,69 +3198,400 @@ define void @zext_v32i8_v32i64(ptr %in, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v32i8_v32i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub sp, sp, #224 -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 224 -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b -; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b -; NONEON-NOSVE-NEXT: stp q0, q1, [sp] -; NONEON-NOSVE-NEXT: ushll v5.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: ushll v6.8h, v1.8b, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v3.8h, v2.8b, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ushll v4.8h, v2.8b, #0 -; NONEON-NOSVE-NEXT: stp q3, q5, [sp, #32] -; NONEON-NOSVE-NEXT: ushll v5.4s, v5.4h, #0 -; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #56] -; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] -; NONEON-NOSVE-NEXT: stp q4, q6, [sp, #64] -; NONEON-NOSVE-NEXT: ushll v6.4s, v6.4h, #0 -; NONEON-NOSVE-NEXT: ushll v4.4s, v4.4h, #0 -; NONEON-NOSVE-NEXT: ldr d7, [sp, #88] -; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #72] -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ushll v7.4s, v7.4h, #0 -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: stp q2, q5, [sp, #128] -; NONEON-NOSVE-NEXT: 
ushll v5.2d, v5.2s, #0 -; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: ldr d19, [sp, #152] -; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #96] -; NONEON-NOSVE-NEXT: ldr d20, [sp, #136] -; NONEON-NOSVE-NEXT: stp q1, q4, [sp, #160] -; NONEON-NOSVE-NEXT: ldr d17, [sp, #104] -; NONEON-NOSVE-NEXT: ldr d21, [sp, #120] -; NONEON-NOSVE-NEXT: stp q7, q6, [sp, #192] -; NONEON-NOSVE-NEXT: ushll v6.2d, v6.2s, #0 -; NONEON-NOSVE-NEXT: ushll v19.2d, v19.2s, #0 -; NONEON-NOSVE-NEXT: ldr d16, [sp, #216] -; NONEON-NOSVE-NEXT: ldr d22, [sp, #200] -; NONEON-NOSVE-NEXT: ldr d23, [sp, #184] -; NONEON-NOSVE-NEXT: ldr d18, [sp, #168] -; NONEON-NOSVE-NEXT: ushll v4.2d, v4.2s, #0 -; NONEON-NOSVE-NEXT: ushll v3.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: ushll v16.2d, v16.2s, #0 -; NONEON-NOSVE-NEXT: stp q5, q19, [x1] -; NONEON-NOSVE-NEXT: ushll v5.2d, v7.2s, #0 -; NONEON-NOSVE-NEXT: ushll v7.2d, v22.2s, #0 -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: stp q6, q16, [x1, #128] -; NONEON-NOSVE-NEXT: ushll v6.2d, v23.2s, #0 -; NONEON-NOSVE-NEXT: stp q5, q7, [x1, #160] -; NONEON-NOSVE-NEXT: ushll v5.2d, v20.2s, #0 -; NONEON-NOSVE-NEXT: stp q4, q6, [x1, #192] -; NONEON-NOSVE-NEXT: ushll v4.2d, v21.2s, #0 -; NONEON-NOSVE-NEXT: stp q2, q5, [x1, #32] -; NONEON-NOSVE-NEXT: ushll v2.2d, v17.2s, #0 -; NONEON-NOSVE-NEXT: stp q3, q4, [x1, #64] -; NONEON-NOSVE-NEXT: ushll v3.2d, v18.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #96] -; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #224] -; NONEON-NOSVE-NEXT: add sp, sp, #224 +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #-96]! 
// 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #16] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #32] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: sub sp, sp, #752 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 848 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: str wzr, [sp, #572] +; NONEON-NOSVE-NEXT: str wzr, [sp, #564] +; NONEON-NOSVE-NEXT: str wzr, [sp, #588] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w29, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w27, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w25, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w23, [sp, #28] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w21, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w19, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w6, [sp, #38] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: ldrb w28, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #22] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: add w8, w29, w29 +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; 
NONEON-NOSVE-NEXT: add w8, w27, w27 +; NONEON-NOSVE-NEXT: add w18, w16, w16 +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: add w8, w25, w25 +; NONEON-NOSVE-NEXT: ldrb w4, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: add w8, w23, w23 +; NONEON-NOSVE-NEXT: ldrb w26, [sp, #31] +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: add w8, w21, w21 +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: add w8, w19, w19 +; NONEON-NOSVE-NEXT: ldrb w24, [sp, #29] +; NONEON-NOSVE-NEXT: strb w9, [sp, #59] +; NONEON-NOSVE-NEXT: add w9, w28, w28 +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #23] +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: add w8, w6, w6 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w22, [sp, #27] +; NONEON-NOSVE-NEXT: strb w9, [sp, #57] +; NONEON-NOSVE-NEXT: add w9, w26, w26 +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: add w8, w4, w4 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w20, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w30, [sp, #21] +; NONEON-NOSVE-NEXT: add w17, w17, w17 +; NONEON-NOSVE-NEXT: strb w9, [sp, #55] +; NONEON-NOSVE-NEXT: add w9, w24, w24 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: add w8, w2, w2 +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #42] +; NONEON-NOSVE-NEXT: strb w9, [sp, #53] +; NONEON-NOSVE-NEXT: add w9, w22, w22 +; NONEON-NOSVE-NEXT: ldrb w7, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: add w8, w16, w16 +; NONEON-NOSVE-NEXT: ldrb w5, [sp, #37] +; NONEON-NOSVE-NEXT: strb w17, [sp, #63] +; NONEON-NOSVE-NEXT: add w17, w30, w30 +; NONEON-NOSVE-NEXT: ldrb w3, [sp, #35] +; NONEON-NOSVE-NEXT: strb w9, [sp, #51] +; NONEON-NOSVE-NEXT: add w9, w20, w20 +; NONEON-NOSVE-NEXT: ldrb w0, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: 
ldrb w15, [sp, #47] +; NONEON-NOSVE-NEXT: strb w18, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #43] +; NONEON-NOSVE-NEXT: strb w17, [sp, #61] +; NONEON-NOSVE-NEXT: strb w9, [sp, #49] +; NONEON-NOSVE-NEXT: add w9, w7, w7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #79] +; NONEON-NOSVE-NEXT: add w9, w5, w5 +; NONEON-NOSVE-NEXT: strb w9, [sp, #77] +; NONEON-NOSVE-NEXT: add w9, w3, w3 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #91] +; NONEON-NOSVE-NEXT: strb w9, [sp, #75] +; NONEON-NOSVE-NEXT: add w9, w0, w0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #134] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #90] +; NONEON-NOSVE-NEXT: strb w9, [sp, #73] +; NONEON-NOSVE-NEXT: add w9, w15, w15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #132] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #89] +; NONEON-NOSVE-NEXT: strb w9, [sp, #71] +; NONEON-NOSVE-NEXT: add w9, w13, w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #130] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #88] +; NONEON-NOSVE-NEXT: strb w9, [sp, #69] +; NONEON-NOSVE-NEXT: add w9, w11, w11 +; NONEON-NOSVE-NEXT: strh w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #95] +; NONEON-NOSVE-NEXT: strb w9, [sp, #67] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #142] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #94] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: str wzr, [sp, #580] +; NONEON-NOSVE-NEXT: strh w8, [sp, #140] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #93] +; NONEON-NOSVE-NEXT: strb w9, [sp, #65] +; NONEON-NOSVE-NEXT: strh w8, [sp, #138] +; NONEON-NOSVE-NEXT: 
ldrb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: str wzr, [sp, #604] +; NONEON-NOSVE-NEXT: strh w8, [sp, #136] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #83] +; NONEON-NOSVE-NEXT: str wzr, [sp, #596] +; NONEON-NOSVE-NEXT: strh w8, [sp, #118] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #82] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #128] +; NONEON-NOSVE-NEXT: strh w8, [sp, #116] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #81] +; NONEON-NOSVE-NEXT: str wzr, [sp, #620] +; NONEON-NOSVE-NEXT: strh w8, [sp, #114] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #80] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #192] +; NONEON-NOSVE-NEXT: strh w8, [sp, #112] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #87] +; NONEON-NOSVE-NEXT: str wzr, [sp, #612] +; NONEON-NOSVE-NEXT: strh w8, [sp, #126] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #86] +; NONEON-NOSVE-NEXT: str wzr, [sp, #508] +; NONEON-NOSVE-NEXT: strh w8, [sp, #124] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #85] +; NONEON-NOSVE-NEXT: str wzr, [sp, #500] +; NONEON-NOSVE-NEXT: strh w8, [sp, #122] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #84] +; NONEON-NOSVE-NEXT: str wzr, [sp, #524] +; NONEON-NOSVE-NEXT: strh w8, [sp, #120] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #107] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #112] +; NONEON-NOSVE-NEXT: str wzr, [sp, #516] +; NONEON-NOSVE-NEXT: strh w8, [sp, #166] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #106] +; NONEON-NOSVE-NEXT: str wzr, [sp, #540] +; NONEON-NOSVE-NEXT: strh w8, [sp, #164] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #105] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #176] +; NONEON-NOSVE-NEXT: strh w8, [sp, #162] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #104] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #178] +; NONEON-NOSVE-NEXT: str wzr, [sp, #532] +; NONEON-NOSVE-NEXT: strh w8, [sp, #160] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #111] +; NONEON-NOSVE-NEXT: str wzr, [sp, #556] +; NONEON-NOSVE-NEXT: strh w8, [sp, #174] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #110] +; NONEON-NOSVE-NEXT: str wzr, [sp, 
#548] +; NONEON-NOSVE-NEXT: strh w8, [sp, #172] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #109] +; NONEON-NOSVE-NEXT: str wzr, [sp, #700] +; NONEON-NOSVE-NEXT: strh w8, [sp, #170] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #108] +; NONEON-NOSVE-NEXT: str wzr, [sp, #692] +; NONEON-NOSVE-NEXT: strh w8, [sp, #168] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #99] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #160] +; NONEON-NOSVE-NEXT: str wzr, [sp, #716] +; NONEON-NOSVE-NEXT: strh w8, [sp, #150] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #98] +; NONEON-NOSVE-NEXT: str wzr, [sp, #708] +; NONEON-NOSVE-NEXT: strh w8, [sp, #148] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #97] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #224] +; NONEON-NOSVE-NEXT: strh w8, [sp, #146] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #96] +; NONEON-NOSVE-NEXT: str wzr, [sp, #732] +; NONEON-NOSVE-NEXT: strh w8, [sp, #144] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #103] +; NONEON-NOSVE-NEXT: str wzr, [sp, #724] +; NONEON-NOSVE-NEXT: strh w8, [sp, #158] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #102] +; NONEON-NOSVE-NEXT: str wzr, [sp, #748] +; NONEON-NOSVE-NEXT: strh w8, [sp, #156] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #101] +; NONEON-NOSVE-NEXT: str wzr, [sp, #740] +; NONEON-NOSVE-NEXT: strh w8, [sp, #154] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #100] +; NONEON-NOSVE-NEXT: str wzr, [sp, #636] +; NONEON-NOSVE-NEXT: strh w8, [sp, #152] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #194] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #144] +; NONEON-NOSVE-NEXT: str wzr, [sp, #628] +; NONEON-NOSVE-NEXT: str w8, [sp, #276] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #192] +; NONEON-NOSVE-NEXT: str wzr, [sp, #652] +; NONEON-NOSVE-NEXT: str w8, [sp, #272] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #198] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #208] +; NONEON-NOSVE-NEXT: str w8, [sp, #284] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #196] +; NONEON-NOSVE-NEXT: str wzr, [sp, #644] +; NONEON-NOSVE-NEXT: str w8, [sp, #280] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #202] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #272] +; 
NONEON-NOSVE-NEXT: str wzr, [sp, #668] +; NONEON-NOSVE-NEXT: str w8, [sp, #292] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #200] +; NONEON-NOSVE-NEXT: str wzr, [sp, #660] +; NONEON-NOSVE-NEXT: str w8, [sp, #288] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #206] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #400] +; NONEON-NOSVE-NEXT: str w8, [sp, #300] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #204] +; NONEON-NOSVE-NEXT: str wzr, [sp, #684] +; NONEON-NOSVE-NEXT: str w8, [sp, #296] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #176] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #288] +; NONEON-NOSVE-NEXT: str wzr, [sp, #676] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #240] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #182] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #180] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #248] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #186] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #416] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #240] +; NONEON-NOSVE-NEXT: str w8, [sp, #260] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #184] +; NONEON-NOSVE-NEXT: str w8, [sp, #256] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #190] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #368] +; NONEON-NOSVE-NEXT: str w8, [sp, #268] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #188] +; NONEON-NOSVE-NEXT: str w8, [sp, #264] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #226] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #256] +; NONEON-NOSVE-NEXT: str w8, [sp, #340] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #224] +; NONEON-NOSVE-NEXT: str w8, [sp, #336] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #230] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #384] +; NONEON-NOSVE-NEXT: str w8, [sp, #348] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #228] +; NONEON-NOSVE-NEXT: str w8, [sp, #344] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #234] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #336] +; NONEON-NOSVE-NEXT: str w8, [sp, #356] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #232] +; NONEON-NOSVE-NEXT: str w8, [sp, #352] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #238] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #464] +; NONEON-NOSVE-NEXT: str w8, [sp, #364] +; 
NONEON-NOSVE-NEXT: ldrb w8, [sp, #236] +; NONEON-NOSVE-NEXT: str w8, [sp, #360] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #210] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #352] +; NONEON-NOSVE-NEXT: str w8, [sp, #308] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #208] +; NONEON-NOSVE-NEXT: str w8, [sp, #304] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #214] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #480] +; NONEON-NOSVE-NEXT: str w8, [sp, #316] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #212] +; NONEON-NOSVE-NEXT: str w8, [sp, #312] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #218] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #304] +; NONEON-NOSVE-NEXT: str w8, [sp, #324] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #216] +; NONEON-NOSVE-NEXT: str w8, [sp, #320] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #222] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #432] +; NONEON-NOSVE-NEXT: str w8, [sp, #332] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #220] +; NONEON-NOSVE-NEXT: str w8, [sp, #328] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #404] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #320] +; NONEON-NOSVE-NEXT: str w8, [sp, #568] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #400] +; NONEON-NOSVE-NEXT: str w8, [sp, #560] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #412] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #448] +; NONEON-NOSVE-NEXT: str w8, [sp, #584] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #408] +; NONEON-NOSVE-NEXT: str w8, [sp, #576] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #420] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #560] +; NONEON-NOSVE-NEXT: str w8, [sp, #600] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #416] +; NONEON-NOSVE-NEXT: str w8, [sp, #592] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #428] +; NONEON-NOSVE-NEXT: str w8, [sp, #616] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #424] +; NONEON-NOSVE-NEXT: str w8, [sp, #608] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #372] +; NONEON-NOSVE-NEXT: ldp q2, q3, [sp, #592] +; NONEON-NOSVE-NEXT: str w8, [sp, #504] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #368] +; NONEON-NOSVE-NEXT: str w8, [sp, #496] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #380] +; NONEON-NOSVE-NEXT: str w8, [sp, 
#520] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #376] +; NONEON-NOSVE-NEXT: str w8, [sp, #512] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #388] +; NONEON-NOSVE-NEXT: ldp q4, q5, [sp, #496] +; NONEON-NOSVE-NEXT: str w8, [sp, #536] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #384] +; NONEON-NOSVE-NEXT: str w8, [sp, #528] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #396] +; NONEON-NOSVE-NEXT: str w8, [sp, #552] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #392] +; NONEON-NOSVE-NEXT: str w8, [sp, #544] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #468] +; NONEON-NOSVE-NEXT: ldp q6, q7, [sp, #528] +; NONEON-NOSVE-NEXT: str w8, [sp, #696] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #464] +; NONEON-NOSVE-NEXT: str w8, [sp, #688] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #476] +; NONEON-NOSVE-NEXT: str w8, [sp, #712] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #472] +; NONEON-NOSVE-NEXT: str w8, [sp, #704] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #484] +; NONEON-NOSVE-NEXT: ldp q16, q17, [sp, #688] +; NONEON-NOSVE-NEXT: str w8, [sp, #728] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #480] +; NONEON-NOSVE-NEXT: str w8, [sp, #720] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #492] +; NONEON-NOSVE-NEXT: str w8, [sp, #744] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #488] +; NONEON-NOSVE-NEXT: str w8, [sp, #736] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #436] +; NONEON-NOSVE-NEXT: ldp q19, q20, [sp, #720] +; NONEON-NOSVE-NEXT: str w8, [sp, #632] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #432] +; NONEON-NOSVE-NEXT: str w8, [sp, #624] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #444] +; NONEON-NOSVE-NEXT: str w8, [sp, #648] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #440] +; NONEON-NOSVE-NEXT: str w8, [sp, #640] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #452] +; NONEON-NOSVE-NEXT: ldp q22, q23, [sp, #624] +; NONEON-NOSVE-NEXT: str w8, [sp, #664] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #448] +; NONEON-NOSVE-NEXT: str w8, [sp, #656] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #460] +; NONEON-NOSVE-NEXT: str w8, [sp, #680] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #456] +; NONEON-NOSVE-NEXT: str w8, [sp, #672] +; NONEON-NOSVE-NEXT: ldp q21, q18, [sp, 
#656] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1, #32] +; NONEON-NOSVE-NEXT: stp q4, q5, [x1, #64] +; NONEON-NOSVE-NEXT: stp q6, q7, [x1, #96] +; NONEON-NOSVE-NEXT: stp q16, q17, [x1, #128] +; NONEON-NOSVE-NEXT: stp q19, q20, [x1, #160] +; NONEON-NOSVE-NEXT: stp q22, q23, [x1, #192] +; NONEON-NOSVE-NEXT: stp q21, q18, [x1, #224] +; NONEON-NOSVE-NEXT: add sp, sp, #752 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #32] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #16] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp], #96 // 16-byte Folded Reload ; NONEON-NOSVE-NEXT: ret %a = load <32 x i8>, ptr %in %b = add <32 x i8> %a, %a @@ -1387,13 +3616,25 @@ define void @zext_v8i16_v8i32(<8 x i16> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v8i16_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #18] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %b = zext <8 x i16> %a to <8 x i32> store <8 x i32>%b, ptr %out @@ -1418,20 +3659,91 @@ define void @zext_v16i16_v16i32(ptr %in, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v16i16_v16i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h -; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: sub sp, sp, #160 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w3, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w5, [sp] +; NONEON-NOSVE-NEXT: ldrh w2, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w4, [sp, #14] +; NONEON-NOSVE-NEXT: add w13, w13, w13 +; NONEON-NOSVE-NEXT: add w14, w14, w14 +; NONEON-NOSVE-NEXT: ldrh w18, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w0, [sp, #10] +; NONEON-NOSVE-NEXT: strh w14, [sp, #46] +; NONEON-NOSVE-NEXT: add w14, w3, w3 +; NONEON-NOSVE-NEXT: strh w13, [sp, #44] +; NONEON-NOSVE-NEXT: add w13, w5, w5 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: strh w14, [sp, #42] +; NONEON-NOSVE-NEXT: add w14, w4, w4 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #26] +; NONEON-NOSVE-NEXT: strh w13, [sp, #40] +; NONEON-NOSVE-NEXT: add w13, w2, w2 +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #22] +; NONEON-NOSVE-NEXT: strh w14, [sp, #38] +; NONEON-NOSVE-NEXT: add w14, w0, w0 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strh w13, [sp, #36] +; NONEON-NOSVE-NEXT: add w13, w18, w18 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w14, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #30] +; NONEON-NOSVE-NEXT: strh w13, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #18] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w16, 
[sp, #20] +; NONEON-NOSVE-NEXT: strh w9, [sp, #50] +; NONEON-NOSVE-NEXT: add w14, w17, w17 +; NONEON-NOSVE-NEXT: add w12, w12, w12 +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: add w13, w16, w16 +; NONEON-NOSVE-NEXT: add w11, w11, w11 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: add w10, w10, w10 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #76] +; NONEON-NOSVE-NEXT: strh w14, [sp, #62] +; NONEON-NOSVE-NEXT: add w14, w15, w15 +; NONEON-NOSVE-NEXT: strh w13, [sp, #60] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #72] +; NONEON-NOSVE-NEXT: strh w14, [sp, #58] +; NONEON-NOSVE-NEXT: strh w12, [sp, #56] +; NONEON-NOSVE-NEXT: strh w11, [sp, #54] +; NONEON-NOSVE-NEXT: strh w10, [sp, #52] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #68] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #64] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #96] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #96] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #152] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #88] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #144] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #84] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #136] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #80] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #128] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %a = load <16 x i16>, ptr %in %b 
= add <16 x i16> %a, %a @@ -1457,14 +3769,26 @@ define void @zext_v4i16_v4i64(<4 x i16> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v4i16_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #72] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #64] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #56] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #48] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %b = zext <4 x i16> %a to <4 x i64> store <4 x i64>%b, ptr %out @@ -1490,21 +3814,43 @@ define void @zext_v8i16_v8i64(<8 x i16> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v8i16_v8i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: ushll v3.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q2, [x0] -; NONEON-NOSVE-NEXT: stp q1, q3, [x0, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: str q0, [sp, #-160]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #18] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #152] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #144] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #80] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #136] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #128] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #128] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #120] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #112] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #104] +; 
NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %b = zext <8 x i16> %a to <8 x i64> store <8 x i64>%b, ptr %out @@ -1543,36 +3889,144 @@ define void @zext_v16i16_v16i64(ptr %in, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v16i16_v16i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h -; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #64] -; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ldr d4, [sp, #88] -; NONEON-NOSVE-NEXT: ldr d6, [sp, #40] -; NONEON-NOSVE-NEXT: ldr d7, [sp, #72] -; NONEON-NOSVE-NEXT: ushll v5.2d, v5.2s, #0 -; NONEON-NOSVE-NEXT: ushll v4.2d, v4.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q5, [x1] -; NONEON-NOSVE-NEXT: ushll v0.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: ushll v2.2d, v6.2s, #0 -; NONEON-NOSVE-NEXT: stp q1, q4, [x1, #64] -; NONEON-NOSVE-NEXT: ushll v1.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: ushll v3.2d, v7.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] -; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #96] -; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: sub sp, sp, #368 +; NONEON-NOSVE-NEXT: str x29, [sp, #352] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 368 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: 
str wzr, [sp, #268] +; NONEON-NOSVE-NEXT: str wzr, [sp, #260] +; NONEON-NOSVE-NEXT: ldr x29, [sp, #352] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: str wzr, [sp, #284] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w3, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w5, [sp] +; NONEON-NOSVE-NEXT: ldrh w2, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w4, [sp, #14] +; NONEON-NOSVE-NEXT: add w13, w13, w13 +; NONEON-NOSVE-NEXT: add w14, w14, w14 +; NONEON-NOSVE-NEXT: ldrh w18, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w0, [sp, #10] +; NONEON-NOSVE-NEXT: strh w14, [sp, #54] +; NONEON-NOSVE-NEXT: add w14, w3, w3 +; NONEON-NOSVE-NEXT: strh w13, [sp, #52] +; NONEON-NOSVE-NEXT: add w13, w5, w5 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: strh w14, [sp, #50] +; NONEON-NOSVE-NEXT: add w14, w4, w4 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #26] +; NONEON-NOSVE-NEXT: strh w13, [sp, #48] +; NONEON-NOSVE-NEXT: add w13, w2, w2 +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #22] +; NONEON-NOSVE-NEXT: strh w14, [sp, #46] +; NONEON-NOSVE-NEXT: add w14, w0, w0 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strh w13, [sp, #44] +; NONEON-NOSVE-NEXT: add w13, w18, w18 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w14, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #30] +; NONEON-NOSVE-NEXT: strh w13, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #18] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #40] +; NONEON-NOSVE-NEXT: add w14, w17, w17 +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #20] +; NONEON-NOSVE-NEXT: strh w9, [sp, #58] +; NONEON-NOSVE-NEXT: add w12, w12, w12 +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: add w11, w11, w11 +; NONEON-NOSVE-NEXT: add w10, w10, w10 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #72] +; NONEON-NOSVE-NEXT: add w13, w16, w16 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #82] +; 
NONEON-NOSVE-NEXT: ldrh w8, [sp, #80] +; NONEON-NOSVE-NEXT: strh w14, [sp, #70] +; NONEON-NOSVE-NEXT: add w14, w15, w15 +; NONEON-NOSVE-NEXT: strh w13, [sp, #68] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #84] +; NONEON-NOSVE-NEXT: strh w14, [sp, #66] +; NONEON-NOSVE-NEXT: strh w12, [sp, #64] +; NONEON-NOSVE-NEXT: strh w11, [sp, #62] +; NONEON-NOSVE-NEXT: strh w10, [sp, #60] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #56] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #72] +; NONEON-NOSVE-NEXT: str wzr, [sp, #276] +; NONEON-NOSVE-NEXT: str wzr, [sp, #332] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #76] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #88] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #120] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #98] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #96] +; NONEON-NOSVE-NEXT: str wzr, [sp, #324] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #152] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #102] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #100] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #184] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #104] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #160] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #88] +; NONEON-NOSVE-NEXT: str wzr, [sp, #348] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #136] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #92] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #168] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #152] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #144] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #184] +; NONEON-NOSVE-NEXT: str d0, [sp, #360] +; NONEON-NOSVE-NEXT: ldp d2, d0, [sp, #136] +; NONEON-NOSVE-NEXT: str wzr, [sp, #340] +; NONEON-NOSVE-NEXT: str w9, [sp, #264] +; NONEON-NOSVE-NEXT: stp wzr, w8, 
[sp, #252] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #192] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #208] +; NONEON-NOSVE-NEXT: str d2, [sp, #200] +; NONEON-NOSVE-NEXT: str w9, [sp, #280] +; NONEON-NOSVE-NEXT: str w8, [sp, #272] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #168] +; NONEON-NOSVE-NEXT: str wzr, [sp, #300] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #256] +; NONEON-NOSVE-NEXT: str wzr, [sp, #292] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #232] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #224] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #176] +; NONEON-NOSVE-NEXT: str wzr, [sp, #316] +; NONEON-NOSVE-NEXT: str wzr, [sp, #308] +; NONEON-NOSVE-NEXT: stp wzr, w9, [sp, #244] +; NONEON-NOSVE-NEXT: str w8, [sp, #240] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #216] +; NONEON-NOSVE-NEXT: ldp q3, q4, [sp, #224] +; NONEON-NOSVE-NEXT: str w8, [sp, #320] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #364] +; NONEON-NOSVE-NEXT: str w9, [sp, #328] +; NONEON-NOSVE-NEXT: str w8, [sp, #344] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #360] +; NONEON-NOSVE-NEXT: str w8, [sp, #336] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #200] +; NONEON-NOSVE-NEXT: ldp q6, q7, [sp, #320] +; NONEON-NOSVE-NEXT: str w9, [sp, #296] +; NONEON-NOSVE-NEXT: str w8, [sp, #288] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #208] +; NONEON-NOSVE-NEXT: str w9, [sp, #312] +; NONEON-NOSVE-NEXT: str w8, [sp, #304] +; NONEON-NOSVE-NEXT: ldp q5, q2, [sp, #288] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: stp q3, q4, [x1, #32] +; NONEON-NOSVE-NEXT: stp q6, q7, [x1, #64] +; NONEON-NOSVE-NEXT: stp q5, q2, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #368 ; NONEON-NOSVE-NEXT: ret %a = load <16 x i16>, ptr %in %b = add <16 x i16> %a, %a @@ -1597,13 +4051,19 @@ define void @zext_v4i32_v4i64(<4 x i32> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v4i32_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #56] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #48] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #40] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %b = zext <4 x i32> %a to <4 x i64> store <4 x i64>%b, ptr %out @@ -1628,20 +4088,47 @@ define void @zext_v8i32_v8i64(ptr %in, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v8i32_v8i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add v0.4s, v0.4s, v0.4s -; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v1.4s -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: ushll v3.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: sub sp, sp, #160 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: ldp w12, w13, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w14, w15, [sp, #16] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: add w9, w13, w13 +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: add w9, w15, w15 +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: ldp w10, w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: add w9, w11, w11 +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #120] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #112] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #104] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #96] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #96] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #152] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #144] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #80] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #136] +; NONEON-NOSVE-NEXT: stp w8, wzr, 
[sp, #128] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #128] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %a = load <8 x i32>, ptr %in %b = add <8 x i32> %a, %a @@ -1672,17 +4159,17 @@ define void @extend_and_mul(i32 %0, <2 x i64> %1, ptr %2) { ; ; NONEON-NOSVE-LABEL: extend_and_mul: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: dup v1.2s, w0 -; NONEON-NOSVE-NEXT: fmov x10, d0 -; NONEON-NOSVE-NEXT: mov x8, v0.d[1] -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: fmov x11, d1 -; NONEON-NOSVE-NEXT: mov x9, v1.d[1] -; NONEON-NOSVE-NEXT: mul x10, x11, x10 +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: mov w9, w0 +; NONEON-NOSVE-NEXT: mul x10, x9, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp] ; NONEON-NOSVE-NEXT: mul x8, x9, x8 -; NONEON-NOSVE-NEXT: fmov d0, x10 -; NONEON-NOSVE-NEXT: mov v0.d[1], x8 +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %broadcast.splatinsert2 = insertelement <2 x i32> poison, i32 %0, i64 0 %broadcast.splat3 = shufflevector <2 x i32> %broadcast.splatinsert2, <2 x i32> poison, <2 x i32> zeroinitializer @@ -1702,9 +4189,12 @@ define void @extend_no_mul(i32 %0, <2 x i64> %1, ptr %2) { ; ; NONEON-NOSVE-LABEL: extend_no_mul: ; NONEON-NOSVE: // %bb.0: // %entry -; NONEON-NOSVE-NEXT: dup v0.2s, w0 -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: mov w8, w0 +; NONEON-NOSVE-NEXT: stp x8, x8, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp] ; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret entry: %broadcast.splatinsert2 = insertelement <2 x i32> poison, i32 %0, i64 0 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll index ade60b07150ce2..1f5bb5f5486af3 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll @@ -26,11 +26,108 @@ define void @add_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: add_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.16b, #7 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: add v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; 
NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = 
load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i32 0 @@ -51,12 +148,60 @@ define void @add_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: add_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 -; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: add v0.8h, v2.8h, v0.8h -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, 
[sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -77,12 +222,32 @@ define void @add_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: add_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.4s, w8 -; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: add v0.4s, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: add w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: add w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: add w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: add w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: add w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -103,12 +268,22 @@ define void @add_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: add_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.2d, x8 -; NONEON-NOSVE-NEXT: add v1.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: add v0.2d, v2.2d, v0.2d -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: add x9, x8, #63 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: add x8, x8, #63 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: add x9, x8, #63 +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: add x8, x8, #63 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -133,11 +308,108 @@ define void @and_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: and_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.16b, #7 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: and v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; 
NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, 
[x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i32 0 @@ -158,12 +430,60 @@ define void @and_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: and_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 -; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: and v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; 
NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -184,12 +504,32 @@ define void @and_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: and_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.4s, w8 -; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: and v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: and w9, w8, #0x1f +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: and w8, w8, #0x1f +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: and w9, w8, #0x1f +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: and w8, w8, #0x1f +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: and w9, w8, #0x1f +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: and w8, w8, #0x1f +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: and w9, w8, #0x1f +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: and w8, w8, #0x1f +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -210,12 +550,22 @@ define void @and_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: and_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.2d, x8 -; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: and v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: and x9, x8, #0x3f +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: and x8, x8, #0x3f +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: and x9, x8, #0x3f +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: and x8, x8, #0x3f +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -240,10 +590,108 @@ define void @ashr_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: ashr_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: cmlt v0.16b, v0.16b, #0 -; NONEON-NOSVE-NEXT: cmlt v1.16b, v1.16b, #0 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsb w8, 
[sp, #15] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #14] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #13] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #12] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #11] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #10] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #9] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #8] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #7] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #6] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #5] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #4] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #3] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #2] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #1] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; 
NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i32 0 @@ -264,10 +712,60 @@ define void @ashr_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: ashr_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: cmlt v0.8h, v0.8h, #0 -; NONEON-NOSVE-NEXT: cmlt v1.8h, v1.8h, #0 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #14] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #12] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #10] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #8] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #6] +; NONEON-NOSVE-NEXT: lsr 
w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #4] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #2] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -288,10 +786,32 @@ define void @ashr_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: ashr_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: cmlt v0.4s, v0.4s, #0 -; NONEON-NOSVE-NEXT: cmlt v1.4s, v1.4s, #0 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: asr w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: asr w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: asr w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: asr w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = 
insertelement <8 x i32> undef, i32 31, i64 0 @@ -312,10 +832,22 @@ define void @ashr_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: ashr_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: cmlt v0.2d, v0.2d, #0 -; NONEON-NOSVE-NEXT: cmlt v1.2d, v1.2d, #0 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: asr x9, x8, #63 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: asr x8, x8, #63 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: asr x9, x8, #63 +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: asr x8, x8, #63 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -343,11 +875,140 @@ define void @icmp_eq_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: icmp_eq_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.16b, #7 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: cmeq v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: cmeq v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; 
NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; 
NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -372,12 +1033,76 @@ define void @icmp_sge_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: icmp_sge_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 -; NONEON-NOSVE-NEXT: cmge v1.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: cmge v0.8h, v2.8h, v0.8h -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: 
strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -402,12 +1127,40 @@ define void @icmp_sgt_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: icmp_sgt_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #-8 // =0xfffffff8 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.4s, w8 -; NONEON-NOSVE-NEXT: cmgt v1.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: cmgt v0.4s, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmn w8, #8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: csetm w9, gt +; NONEON-NOSVE-NEXT: cmn w8, #8 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmn w8, #8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: csetm w9, gt +; NONEON-NOSVE-NEXT: cmn w8, #8 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: cmn w8, #8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: csetm w9, gt +; NONEON-NOSVE-NEXT: cmn w8, #8 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: cmn w8, #8 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: csetm w9, gt +; NONEON-NOSVE-NEXT: cmn w8, #8 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 -8, i64 0 @@ -432,12 +1185,26 @@ define void @icmp_ult_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: icmp_ult_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.2d, x8 -; NONEON-NOSVE-NEXT: cmhi v1.2d, v0.2d, v1.2d -; NONEON-NOSVE-NEXT: cmhi v0.2d, v0.2d, v2.2d -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x8, #63 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: csetm x9, lo +; NONEON-NOSVE-NEXT: cmp x8, #63 +; NONEON-NOSVE-NEXT: csetm x8, lo +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp x8, #63 +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: csetm x9, lo +; NONEON-NOSVE-NEXT: cmp x8, #63 +; NONEON-NOSVE-NEXT: csetm x8, lo +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -463,10 +1230,108 @@ define void @lshr_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: lshr_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: ushr v0.16b, v0.16b, #7 -; NONEON-NOSVE-NEXT: ushr v1.16b, v1.16b, #7 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; 
NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, 
[sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -487,10 +1352,60 @@ define void @lshr_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: lshr_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: ushr v0.8h, v0.8h, #15 -; NONEON-NOSVE-NEXT: ushr v1.8h, v1.8h, #15 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; 
NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -511,10 +1426,32 @@ define void @lshr_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: lshr_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: ushr v0.4s, v0.4s, #31 -; NONEON-NOSVE-NEXT: ushr v1.4s, v1.4s, #31 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsr w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsr w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsr w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsr w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: lsr w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: lsr w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: lsr w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: lsr w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -535,10 +1472,22 @@ define void @lshr_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: lshr_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: ushr v0.2d, v0.2d, #63 -; NONEON-NOSVE-NEXT: ushr v1.2d, v1.2d, #63 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: lsr x9, x8, #63 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: lsr x8, x8, #63 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: lsr x9, x8, #63 +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: lsr x8, x8, #63 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -563,11 +1512,140 @@ define void @mul_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: mul_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.16b, #7 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: mul v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: mul v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; 
NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; 
NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -588,12 +1666,76 @@ define void @mul_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: mul_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 -; NONEON-NOSVE-NEXT: mul v1.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: mul v0.8h, v2.8h, v0.8h -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; 
NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -614,12 +1756,44 @@ define void @mul_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: mul_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.4s, w8 -; NONEON-NOSVE-NEXT: mul v1.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: mul v0.4s, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsl w9, w8, #5 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl w9, w8, #5 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsl w9, w8, #5 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w9, w8, #5 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: lsl w9, w8, #5 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: lsl w9, w8, #5 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: lsl w9, w8, #5 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: lsl w9, w8, #5 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -640,24 +1814,28 @@ define void @mul_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: mul_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fmov x10, d0 -; NONEON-NOSVE-NEXT: fmov x11, d1 -; NONEON-NOSVE-NEXT: mov x8, v0.d[1] -; NONEON-NOSVE-NEXT: mov x9, v1.d[1] -; NONEON-NOSVE-NEXT: lsl x12, x10, #6 -; NONEON-NOSVE-NEXT: lsl x13, x11, #6 -; NONEON-NOSVE-NEXT: lsl x14, x8, #6 -; NONEON-NOSVE-NEXT: sub x10, x12, x10 -; NONEON-NOSVE-NEXT: sub x11, x13, x11 -; 
NONEON-NOSVE-NEXT: lsl x12, x9, #6 -; NONEON-NOSVE-NEXT: fmov d0, x10 -; NONEON-NOSVE-NEXT: fmov d1, x11 -; NONEON-NOSVE-NEXT: sub x8, x14, x8 -; NONEON-NOSVE-NEXT: sub x9, x12, x9 -; NONEON-NOSVE-NEXT: mov v0.d[1], x8 -; NONEON-NOSVE-NEXT: mov v1.d[1], x9 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl x9, x8, #6 +; NONEON-NOSVE-NEXT: sub x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #56] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl x9, x8, #6 +; NONEON-NOSVE-NEXT: sub x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: lsl x9, x8, #6 +; NONEON-NOSVE-NEXT: sub x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: lsl x9, x8, #6 +; NONEON-NOSVE-NEXT: sub x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -682,11 +1860,108 @@ define void @or_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: or_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.16b, #7 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: orr v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; 
NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, 
[x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -707,12 +1982,60 @@ define void @or_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: or_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 -; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: orr v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; 
NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -733,12 +2056,32 @@ define void @or_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: or_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.4s, w8 -; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: orr v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: orr w9, w8, #0x1f +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x1f +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: orr w9, w8, #0x1f +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x1f +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: orr w9, w8, #0x1f +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x1f +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: orr w9, w8, #0x1f +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x1f +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -759,12 +2102,22 @@ define void @or_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: or_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.2d, x8 -; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: orr v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: orr x9, x8, #0x3f +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: orr x8, x8, #0x3f +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: orr x9, x8, #0x3f +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: orr x8, x8, #0x3f +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -789,10 +2142,108 @@ define void @shl_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: shl_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: shl v0.16b, v0.16b, #7 -; NONEON-NOSVE-NEXT: shl v1.16b, v1.16b, #7 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; 
NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = 
load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -813,10 +2264,60 @@ define void @shl_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: shl_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: shl v0.8h, v0.8h, #15 -; NONEON-NOSVE-NEXT: shl v1.8h, v1.8h, #15 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, 
[sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -837,10 +2338,32 @@ define void @shl_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: shl_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: shl v0.4s, v0.4s, #31 -; NONEON-NOSVE-NEXT: shl v1.4s, v1.4s, #31 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsl w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsl w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: lsl w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: lsl w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: lsl w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: lsl w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -861,10 
+2384,22 @@ define void @shl_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: shl_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: shl v0.2d, v0.2d, #63 -; NONEON-NOSVE-NEXT: shl v1.2d, v1.2d, #63 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl x9, x8, #63 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl x8, x8, #63 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: lsl x9, x8, #63 +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: lsl x8, x8, #63 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -889,11 +2424,141 @@ define void @smax_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: smax_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.16b, #7 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: smax v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: smax v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: mov w8, #7 // =0x7 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #31] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #63] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #61] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #59] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #57] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #55] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #53] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #19] +; 
NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #51] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #49] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #47] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #45] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #43] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #41] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #7] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, 
[sp, #39] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #5] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #37] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #3] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #35] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #1] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #33] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -914,12 +2579,77 @@ define void @smax_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: smax_v16i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] ; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 -; NONEON-NOSVE-NEXT: smax v1.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: smax v0.8h, v2.8h, v0.8h -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w9, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w9, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w9, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w9, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w9, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w9, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w9, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w9, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w9, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; 
NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w9, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w9, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w9, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -940,12 +2670,41 @@ define void @smax_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: smax_v8i32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] ; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.4s, w8 -; NONEON-NOSVE-NEXT: smax v1.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: smax v0.4s, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, gt +; NONEON-NOSVE-NEXT: ldr w9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, gt +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, gt +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, gt +; NONEON-NOSVE-NEXT: ldr w9, [sp] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -966,14 +2725,27 @@ define void @smax_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: smax_v4i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] ; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.2d, x8 -; NONEON-NOSVE-NEXT: cmgt v3.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: cmgt v4.2d, v2.2d, v0.2d -; NONEON-NOSVE-NEXT: bif v1.16b, v0.16b, v3.16b -; NONEON-NOSVE-NEXT: bit v0.16b, v2.16b, v4.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x10, x9, x8, gt +; NONEON-NOSVE-NEXT: ldr x9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x9, x9, x8, gt +; NONEON-NOSVE-NEXT: stp x9, x10, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x10, x9, x8, gt +; NONEON-NOSVE-NEXT: ldr x9, [sp] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, gt +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -998,11 +2770,141 @@ define void @smin_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: smin_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.16b, #7 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: smin v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: smin v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: mov w8, #7 // =0x7 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #31] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #63] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #61] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #59] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #57] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #55] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #53] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #19] +; 
NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #51] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #49] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #47] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #45] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #43] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #41] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #7] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, 
[sp, #39] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #5] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #37] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #3] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #35] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #1] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #33] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -1023,12 +2925,77 @@ define void @smin_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: smin_v16i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] ; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 -; NONEON-NOSVE-NEXT: smin v1.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: smin v0.8h, v2.8h, v0.8h -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w9, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w9, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w9, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w9, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w9, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w9, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w9, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w9, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w9, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; 
NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w9, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w9, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w9, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -1049,12 +3016,41 @@ define void @smin_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: smin_v8i32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] ; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.4s, w8 -; NONEON-NOSVE-NEXT: smin v1.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: smin v0.4s, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, lt +; NONEON-NOSVE-NEXT: ldr w9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, lt +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, lt +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, lt +; NONEON-NOSVE-NEXT: ldr w9, [sp] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -1075,14 +3071,27 @@ define void @smin_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: smin_v4i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] ; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.2d, x8 -; NONEON-NOSVE-NEXT: cmgt v3.2d, v0.2d, v1.2d -; NONEON-NOSVE-NEXT: cmgt v4.2d, v0.2d, v2.2d -; NONEON-NOSVE-NEXT: bif v1.16b, v0.16b, v3.16b -; NONEON-NOSVE-NEXT: bit v0.16b, v2.16b, v4.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x10, x9, x8, lt +; NONEON-NOSVE-NEXT: ldr x9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x9, x9, x8, lt +; NONEON-NOSVE-NEXT: stp x9, x10, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x10, x9, x8, lt +; NONEON-NOSVE-NEXT: ldr x9, [sp] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, lt +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -1107,11 +3116,108 @@ define void @sub_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: sub_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.16b, #7 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: sub v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: sub v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; 
NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = 
load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -1132,12 +3238,60 @@ define void @sub_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: sub_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 -; NONEON-NOSVE-NEXT: sub v1.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: sub v0.8h, v2.8h, v0.8h -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, 
[sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -1158,12 +3312,32 @@ define void @sub_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: sub_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.4s, w8 -; NONEON-NOSVE-NEXT: sub v1.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: sub v0.4s, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: sub w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: sub w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: sub w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: sub w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: sub w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: sub w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: sub w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: sub w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -1184,12 +3358,22 @@ define void @sub_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: sub_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.2d, x8 -; NONEON-NOSVE-NEXT: sub v1.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: sub v0.2d, v2.2d, v0.2d -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: sub x9, x8, #63 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: sub x8, x8, #63 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: sub x9, x8, #63 +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: sub x8, x8, #63 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -1214,11 +3398,141 @@ define void @umax_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: umax_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.16b, #7 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: umax v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: umax v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: mov w8, #7 // =0x7 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #31] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #30] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #29] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #28] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #27] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #26] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #25] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #24] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #23] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #22] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #21] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #20] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w9, 
[sp, #19] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #18] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #17] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #16] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, 
w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -1239,12 +3553,77 @@ define void @umax_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: umax_v16i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] ; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 -; NONEON-NOSVE-NEXT: umax v1.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: umax v0.8h, v2.8h, v0.8h -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #30] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w9, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #28] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w9, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #26] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w9, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #24] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #22] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w9, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #20] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w9, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #18] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w9, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #16] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w9, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w9, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w9, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w9, [sp, #40] +; 
NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w9, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w9, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w9, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -1265,12 +3644,41 @@ define void @umax_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: umax_v8i32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] ; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.4s, w8 -; NONEON-NOSVE-NEXT: umax v1.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: umax v0.4s, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, hi +; NONEON-NOSVE-NEXT: ldr w9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, hi +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, hi +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, hi +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, hi +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, hi +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, hi +; NONEON-NOSVE-NEXT: ldr w9, [sp] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -1291,14 +3699,27 @@ define void @umax_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: umax_v4i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] ; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.2d, x8 -; NONEON-NOSVE-NEXT: cmhi v3.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: cmhi v4.2d, v2.2d, v0.2d -; NONEON-NOSVE-NEXT: bif v1.16b, v0.16b, v3.16b -; NONEON-NOSVE-NEXT: bit v0.16b, v2.16b, v4.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x10, x9, x8, hi +; NONEON-NOSVE-NEXT: ldr x9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x9, x9, x8, hi +; NONEON-NOSVE-NEXT: stp x9, x10, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x10, x9, x8, hi +; NONEON-NOSVE-NEXT: ldr x9, [sp] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, hi +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -1323,11 +3744,141 @@ define void @umin_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: umin_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.16b, #7 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: umin v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: umin v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: mov w8, #7 // =0x7 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #31] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, 
#7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w9, 
[sp, #6] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -1348,12 +3899,77 @@ define void @umin_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: umin_v16i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] ; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 -; NONEON-NOSVE-NEXT: umin v1.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: umin v0.8h, v2.8h, v0.8h -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w9, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w9, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w9, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w9, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w9, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w9, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w9, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w9, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w9, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; 
NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w9, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w9, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w9, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -1374,12 +3990,41 @@ define void @umin_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: umin_v8i32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] ; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.4s, w8 -; NONEON-NOSVE-NEXT: umin v1.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: umin v0.4s, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, lo +; NONEON-NOSVE-NEXT: ldr w9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, lo +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, lo +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, lo +; NONEON-NOSVE-NEXT: ldr w9, [sp] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -1400,14 +4045,27 @@ define void @umin_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: umin_v4i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] ; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.2d, x8 -; NONEON-NOSVE-NEXT: cmhi v3.2d, v0.2d, v1.2d -; NONEON-NOSVE-NEXT: cmhi v4.2d, v0.2d, v2.2d -; NONEON-NOSVE-NEXT: bif v1.16b, v0.16b, v3.16b -; NONEON-NOSVE-NEXT: bit v0.16b, v2.16b, v4.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x10, x9, x8, lo +; NONEON-NOSVE-NEXT: ldr x9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x9, x9, x8, lo +; NONEON-NOSVE-NEXT: stp x9, x10, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x10, x9, x8, lo +; NONEON-NOSVE-NEXT: ldr x9, [sp] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, lo +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -1432,11 +4090,108 @@ define void @xor_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: xor_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.16b, #7 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: eor v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: eor v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; 
NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, 
[x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -1457,12 +4212,60 @@ define void @xor_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: xor_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 -; NONEON-NOSVE-NEXT: eor v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: eor v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; 
NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -1483,12 +4286,32 @@ define void @xor_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: xor_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.4s, w8 -; NONEON-NOSVE-NEXT: eor v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: eor v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: eor w9, w8, #0x1f +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x1f +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: eor w9, w8, #0x1f +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x1f +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: eor w9, w8, #0x1f +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x1f +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: eor w9, w8, #0x1f +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x1f +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -1509,12 +4332,22 @@ define void @xor_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: xor_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.2d, x8 -; NONEON-NOSVE-NEXT: eor v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: eor v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: eor x9, x8, #0x3f +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: eor x8, x8, #0x3f +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: eor x9, x8, #0x3f +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: eor x8, x8, #0x3f +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll index 4fc7ec3a8439df..3137a7bc7ad270 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll @@ -20,7 +20,43 @@ define <8 x i8> @and_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: and_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb 
w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = and <8 x i8> %op1, %op2 ret <8 x i8> %res @@ -37,7 +73,74 @@ define <16 x i8> @and_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: and_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, 
#9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = and <16 x i8> %op1, %op2 ret <16 x i8> %res @@ -55,11 +158,143 @@ define void @and_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: and_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b -; 
NONEON-NOSVE-NEXT: and v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: and 
w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb 
w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -79,7 +314,27 @@ define <4 x i16> @and_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: and_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, 
#14] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = and <4 x i16> %op1, %op2 ret <4 x i16> %res @@ -96,7 +351,42 @@ define <8 x i16> @and_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: and_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; 
NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = and <8 x i16> %op1, %op2 ret <8 x i16> %res @@ -114,11 +404,79 @@ define void @and_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: and_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: and v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh 
w8, [sp, #50] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -138,7 +496,18 @@ define <2 x i32> @and_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: and_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 
32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: and w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = and <2 x i32> %op1, %op2 ret <2 x i32> %res @@ -155,7 +524,24 @@ define <4 x i32> @and_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: and_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: and w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: and w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = and <4 x i32> %op1, %op2 ret <4 x i32> %res @@ -173,11 +559,43 @@ define void @and_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: and_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: and v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp 
q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: and w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: str w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: and w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: str w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: and w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: and w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -197,7 +615,14 @@ define <1 x i64> @and_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: and_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: and x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = and <1 x i64> %op1, %op2 ret <1 x i64> %res @@ -214,7 +639,17 @@ define <2 x i64> @and_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; 
NONEON-NOSVE-LABEL: and_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: and x8, x10, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: and x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = and <2 x i64> %op1, %op2 ret <2 x i64> %res @@ -232,11 +667,29 @@ define void @and_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: and_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: and v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: and x8, x10, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #88] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: and x8, x9, x8 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: str x8, [sp, #80] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: and x8, x10, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: and x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -260,7 +713,43 @@ define <8 x i8> @or_v8i8(<8 x i8> %op1, <8 x i8> 
%op2) { ; ; NONEON-NOSVE-LABEL: or_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = or <8 x i8> %op1, %op2 ret <8 x i8> %res @@ -277,7 +766,74 @@ define <16 x i8> @or_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: or_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] 
+; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = or <16 x i8> %op1, %op2 ret <16 x i8> %res @@ -295,11 +851,143 @@ define void @or_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: or_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: orr v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, 
[sp, #59] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, 
[sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, 
#18] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -319,7 +1007,27 @@ define <4 x i16> @or_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: or_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = or <4 x i16> %op1, %op2 ret <4 x i16> %res @@ -336,7 +1044,42 @@ define <8 x i16> @or_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: or_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = or <8 x i16> %op1, %op2 ret <8 x i16> %res @@ -354,11 +1097,79 @@ define void @or_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: or_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: orr v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, 
q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh 
w8, [sp, #22] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -378,7 +1189,18 @@ define <2 x i32> @or_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: or_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: orr w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = or <2 x i32> %op1, %op2 ret <2 x i32> %res @@ -395,7 +1217,24 @@ define <4 x i32> @or_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: or_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: orr w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: orr w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = or <4 x i32> %op1, %op2 ret <4 x i32> %res @@ -413,11 +1252,43 @@ define void @or_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: or_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: orr v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: orr w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: str w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: orr w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: str w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: orr w8, w10, w8 +; NONEON-NOSVE-NEXT: 
str w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: orr w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -437,7 +1308,14 @@ define <1 x i64> @or_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: or_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: orr x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = or <1 x i64> %op1, %op2 ret <1 x i64> %res @@ -454,7 +1332,17 @@ define <2 x i64> @or_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: or_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: orr x8, x10, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: orr x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = or <2 x i64> %op1, %op2 ret <2 x i64> %res @@ -472,11 +1360,29 @@ define void @or_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: or_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: orr v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: orr x8, x10, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #88] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: orr x8, x9, x8 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: str x8, [sp, #80] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: orr x8, x10, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: orr x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -500,7 +1406,43 @@ define <8 x i8> @xor_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: xor_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = xor <8 x i8> %op1, %op2 ret <8 x i8> %res @@ -517,7 +1459,74 @@ define <16 x i8> @xor_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: xor_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: eor v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] 
+; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = xor <16 x i8> %op1, %op2 ret <16 x i8> %res @@ -535,11 +1544,143 @@ define void @xor_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: xor_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: eor v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb 
w8, [sp, #59] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb 
w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, 
[sp, #18] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -559,7 +1700,27 @@ define <4 x i16> @xor_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: xor_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = xor <4 x i16> %op1, %op2 ret <4 x i16> %res @@ -576,7 +1737,42 @@ define <8 x i16> @xor_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: xor_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: eor v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = xor <8 x i16> %op1, %op2 ret <8 x i16> %res @@ -594,11 +1790,79 @@ define void @xor_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: xor_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: eor v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp 
q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: 
ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -618,7 +1882,18 @@ define <2 x i32> @xor_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: xor_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: eor w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = xor <2 x i32> %op1, %op2 ret <2 x i32> %res @@ -635,7 +1910,24 @@ define <4 x i32> @xor_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: xor_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: eor v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: eor w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: eor w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = xor <4 x i32> %op1, %op2 ret <4 x i32> %res @@ -653,11 +1945,43 @@ define void @xor_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: xor_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: eor v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: eor w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: str w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: eor w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: str w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: eor w8, w10, w8 +; 
NONEON-NOSVE-NEXT: str w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: eor w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -677,7 +2001,14 @@ define <1 x i64> @xor_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: xor_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: eor x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = xor <1 x i64> %op1, %op2 ret <1 x i64> %res @@ -694,7 +2025,17 @@ define <2 x i64> @xor_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: xor_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: eor v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: eor x8, x10, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: eor x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = xor <2 x i64> %op1, %op2 ret <2 x i64> %res @@ -712,11 +2053,29 @@ define void @xor_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: xor_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: eor v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: eor x8, x10, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #88] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: eor x8, x9, x8 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: str x8, [sp, #80] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: eor x8, x10, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: eor x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll index b9c859a58611e8..4775a965b70d77 100644 --- 
a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll @@ -21,7 +21,51 @@ define <8 x i8> @smax_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: smax_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smax v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; 
NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.smax.v8i8(<8 x i8> %op1, <8 x i8> %op2) ret <8 x i8> %res @@ -39,7 +83,90 @@ define <16 x i8> @smax_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: smax_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smax v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; 
NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.smax.v16i8(<16 x i8> %op1, <16 x i8> %op2) ret <16 x i8> %res @@ -59,11 +186,175 @@ define void @smax_v32i8(ptr %a, ptr %b) { 
; ; NONEON-NOSVE-LABEL: smax_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: smax v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: smax v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #47] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #62] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #61] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #59] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #58] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #57] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, 
w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #55] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #54] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #53] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #51] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #50] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #49] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, 
w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; 
NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -84,7 +375,31 @@ define <4 x i16> @smax_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: smax_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smax v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; 
NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.smax.v4i16(<4 x i16> %op1, <4 x i16> %op2) ret <4 x i16> %res @@ -102,7 +417,50 @@ define <8 x i16> @smax_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: smax_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smax v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; 
NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %op1, <8 x i16> %op2) ret <8 x i16> %res @@ -122,11 +480,95 @@ define void @smax_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: smax_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: smax v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: smax v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #46] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #58] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #54] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh 
w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #50] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, 
[sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -147,7 +589,19 @@ define <2 x i32> @smax_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: smax_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smax v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, gt +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.smax.v2i32(<2 x i32> %op1, <2 x i32> %op2) ret <2 x i32> %res @@ -165,7 +619,26 @@ define <4 x i32> @smax_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: smax_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smax v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, gt +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, gt +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %op1, <4 x i32> %op2) ret <4 x i32> %res @@ -185,11 +658,47 @@ define void @smax_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: smax_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: smax v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: smax v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, gt +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, gt +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; 
NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, gt +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, gt +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -211,8 +720,15 @@ define <1 x i64> @smax_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: smax_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmgt d2, d0, d1 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, gt +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.smax.v1i64(<1 x i64> %op1, <1 x i64> %op2) ret <1 x i64> %res @@ -231,8 +747,18 @@ define <2 x i64> @smax_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: smax_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmgt v2.2d, v0.2d, v1.2d -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: csel x11, x10, x8, gt +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, gt +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %op1, <2 x i64> %op2) ret <2 x i64> %res @@ -252,14 +778,31 @@ define void @smax_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: smax_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: cmgt v4.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: cmgt v5.2d, v2.2d, v3.2d -; NONEON-NOSVE-NEXT: bit v0.16b, v1.16b, v4.16b -; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b -; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: csel x11, x10, x8, gt +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, gt +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: csel x11, x10, x8, gt +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, gt +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; 
NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -284,7 +827,51 @@ define <8 x i8> @smin_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: smin_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smin v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; 
NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.smin.v8i8(<8 x i8> %op1, <8 x i8> %op2) ret <8 x i8> %res @@ -302,7 +889,90 @@ define <16 x i8> @smin_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: smin_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smin v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; 
NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.smin.v16i8(<16 x i8> %op1, <16 x i8> %op2) ret <16 x i8> %res @@ -322,11 +992,175 @@ define void @smin_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: smin_v32i8: ; NONEON-NOSVE: // %bb.0: -; 
NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: smin v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: smin v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #47] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #62] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #61] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #59] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #58] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #57] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: 
ldrsb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #55] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #54] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #53] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #51] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #50] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #49] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: 
ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb 
w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -347,7 +1181,31 @@ define <4 x i16> @smin_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: smin_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smin v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsh 
w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.smin.v4i16(<4 x i16> %op1, <4 x i16> %op2) ret <4 x i16> %res @@ -365,7 +1223,50 @@ define <8 x i16> @smin_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: smin_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smin v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] 
+; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %op1, <8 x i16> %op2) ret <8 x i16> %res @@ -385,11 +1286,95 @@ define void @smin_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: smin_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: smin v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: smin v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #46] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #58] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #54] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; 
NONEON-NOSVE-NEXT: ldrsh w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #50] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; 
NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -410,7 +1395,19 @@ define <2 x i32> @smin_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: smin_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smin v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, lt +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %op1, <2 x i32> %op2) ret <2 x i32> %res @@ -428,7 +1425,26 @@ define <4 x i32> @smin_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: smin_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smin v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, lt +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, lt +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %op1, <4 x i32> %op2) ret <4 x i32> %res @@ -448,11 +1464,47 @@ define void @smin_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: smin_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: smin v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: smin v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, lt +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, lt +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; 
NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, lt +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, lt +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -474,8 +1526,15 @@ define <1 x i64> @smin_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: smin_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmgt d2, d1, d0 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, lt +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.smin.v1i64(<1 x i64> %op1, <1 x i64> %op2) ret <1 x i64> %res @@ -494,8 +1553,18 @@ define <2 x i64> @smin_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: smin_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmgt v2.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: csel x11, x10, x8, lt +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, lt +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %op1, <2 x i64> %op2) ret <2 x i64> %res @@ -515,14 +1584,31 @@ define void @smin_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: smin_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: cmgt v4.2d, v0.2d, v1.2d -; NONEON-NOSVE-NEXT: cmgt v5.2d, v3.2d, v2.2d -; NONEON-NOSVE-NEXT: bit v0.16b, v1.16b, v4.16b -; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b -; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: csel x11, x10, x8, lt +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, lt +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: csel x11, x10, x8, lt +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, lt +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 
; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -547,7 +1633,51 @@ define <8 x i8> @umax_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: umax_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umax v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; 
NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.umax.v8i8(<8 x i8> %op1, <8 x i8> %op2) ret <8 x i8> %res @@ -565,7 +1695,90 @@ define <16 x i8> @umax_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: umax_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umax v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel 
w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.umax.v16i8(<16 x i8> %op1, <16 x i8> %op2) ret <16 x i8> %res @@ -585,11 +1798,175 @@ define void @umax_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: umax_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; 
NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: umax v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: umax v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, 
#88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; 
NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb 
w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -610,7 +1987,31 @@ define <4 x i16> @umax_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: umax_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umax v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; 
NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.umax.v4i16(<4 x i16> %op1, <4 x i16> %op2) ret <4 x i16> %res @@ -628,7 +2029,50 @@ define <8 x i16> @umax_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: umax_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umax v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: 
cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.umax.v8i16(<8 x i16> %op1, <8 x i16> %op2) ret <8 x i16> %res @@ -648,11 +2092,95 @@ define void @umax_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: umax_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: umax v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: umax v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; 
NONEON-NOSVE-NEXT: ldrh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: strh w8, 
[sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -673,7 +2201,19 @@ define <2 x i32> @umax_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: umax_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umax v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, hi +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.umax.v2i32(<2 x i32> %op1, <2 x i32> %op2) ret <2 x i32> %res @@ -691,7 +2231,26 @@ define <4 x i32> @umax_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: umax_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umax v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, hi +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, hi +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %op1, <4 x i32> %op2) ret <4 x i32> %res @@ -711,11 +2270,47 @@ define void @umax_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: umax_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: umax v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: umax v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, hi +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, hi +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; 
NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, hi +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, hi +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -737,8 +2332,15 @@ define <1 x i64> @umax_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: umax_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmhi d2, d0, d1 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, hi +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.umax.v1i64(<1 x i64> %op1, <1 x i64> %op2) ret <1 x i64> %res @@ -757,8 +2359,18 @@ define <2 x i64> @umax_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: umax_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmhi v2.2d, v0.2d, v1.2d -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: csel x11, x10, x8, hi +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, hi +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %op1, <2 x i64> %op2) ret <2 x i64> %res @@ -778,14 +2390,31 @@ define void @umax_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: umax_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: cmhi v4.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: cmhi v5.2d, v2.2d, v3.2d -; NONEON-NOSVE-NEXT: bit v0.16b, v1.16b, v4.16b -; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b -; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: csel x11, x10, x8, hi +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, hi +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: csel x11, x10, x8, hi +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, hi +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 
; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -810,7 +2439,51 @@ define <8 x i8> @umin_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: umin_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umin v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; 
NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.umin.v8i8(<8 x i8> %op1, <8 x i8> %op2) ret <8 x i8> %res @@ -828,7 +2501,90 @@ define <16 x i8> @umin_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: umin_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umin v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel 
w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.umin.v16i8(<16 x i8> %op1, <16 x i8> %op2) ret <16 x i8> %res @@ -848,11 +2604,175 @@ define void @umin_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: umin_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; 
NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: umin v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: umin v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, 
#88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; 
NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb 
w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -873,7 +2793,31 @@ define <4 x i16> @umin_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: umin_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umin v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; 
NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.umin.v4i16(<4 x i16> %op1, <4 x i16> %op2) ret <4 x i16> %res @@ -891,7 +2835,50 @@ define <8 x i16> @umin_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: umin_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umin v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: 
cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.umin.v8i16(<8 x i16> %op1, <8 x i16> %op2) ret <8 x i16> %res @@ -911,11 +2898,95 @@ define void @umin_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: umin_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: umin v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: umin v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; 
NONEON-NOSVE-NEXT: ldrh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w8, 
[sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -936,7 +3007,19 @@ define <2 x i32> @umin_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: umin_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umin v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, lo +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.umin.v2i32(<2 x i32> %op1, <2 x i32> %op2) ret <2 x i32> %res @@ -954,7 +3037,26 @@ define <4 x i32> @umin_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: umin_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umin v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, lo +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, lo +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %op1, <4 x i32> %op2) ret <4 x i32> %res @@ -974,11 +3076,47 @@ define void @umin_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: umin_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: umin v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: umin v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, lo +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, lo +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; 
NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, lo +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, lo +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -1000,8 +3138,15 @@ define <1 x i64> @umin_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: umin_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmhi d2, d1, d0 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, lo +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.umin.v1i64(<1 x i64> %op1, <1 x i64> %op2) ret <1 x i64> %res @@ -1020,8 +3165,18 @@ define <2 x i64> @umin_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: umin_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmhi v2.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: csel x11, x10, x8, lo +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, lo +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %op1, <2 x i64> %op2) ret <2 x i64> %res @@ -1041,14 +3196,31 @@ define void @umin_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: umin_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: cmhi v4.2d, v0.2d, v1.2d -; NONEON-NOSVE-NEXT: cmhi v5.2d, v3.2d, v2.2d -; NONEON-NOSVE-NEXT: bit v0.16b, v1.16b, v4.16b -; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b -; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: csel x11, x10, x8, lo +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, lo +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: csel x11, x10, x8, lo +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, lo +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 
; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll index 3a03de3442d581..94d5bb1543b0e0 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll @@ -24,8 +24,51 @@ define <8 x i8> @mla8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) { ; ; NONEON-NOSVE-LABEL: mla8xi8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mla v2.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov d0, d2 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrb w1, [sp, #15] +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #7] +; NONEON-NOSVE-NEXT: str d2, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w5, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w3, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w4, [sp, #6] +; NONEON-NOSVE-NEXT: madd w1, w2, w1, w5 +; NONEON-NOSVE-NEXT: ldrb w18, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w0, [sp, #5] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #4] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #3] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #2] +; NONEON-NOSVE-NEXT: strb w1, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w1, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #9] +; NONEON-NOSVE-NEXT: madd w1, w4, w3, w1 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w1, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w1, [sp, #21] +; NONEON-NOSVE-NEXT: madd w18, w0, w18, w1 +; NONEON-NOSVE-NEXT: strb w18, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w18, [sp, #20] +; NONEON-NOSVE-NEXT: madd w16, w17, w16, w18 +; NONEON-NOSVE-NEXT: strb w16, [sp, #28] +; 
NONEON-NOSVE-NEXT: ldrb w16, [sp, #19] +; NONEON-NOSVE-NEXT: madd w14, w15, w14, w16 +; NONEON-NOSVE-NEXT: strb w14, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #18] +; NONEON-NOSVE-NEXT: madd w12, w13, w12, w14 +; NONEON-NOSVE-NEXT: strb w12, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #17] +; NONEON-NOSVE-NEXT: madd w10, w11, w10, w12 +; NONEON-NOSVE-NEXT: strb w10, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #16] +; NONEON-NOSVE-NEXT: madd w8, w9, w8, w10 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %tmp1 = mul <8 x i8> %A, %B; %tmp2 = add <8 x i8> %C, %tmp1; diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll index 1ed3d8fa39d8da..6198926c0b4381 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll @@ -40,12 +40,31 @@ define <4 x i8> @smulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: smulh_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #8 -; NONEON-NOSVE-NEXT: shl v1.4h, v1.4h, #8 -; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #8 -; NONEON-NOSVE-NEXT: sshr v1.4h, v1.4h, #8 -; NONEON-NOSVE-NEXT: mul v0.4h, v0.4h, v1.4h -; NONEON-NOSVE-NEXT: ushr v0.4h, v0.4h, #4 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrsb w12, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #10] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsb w13, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsb w14, [sp, #18] +; NONEON-NOSVE-NEXT: mul w8, w8, w12 +; NONEON-NOSVE-NEXT: ldrsb w12, [sp, #16] +; NONEON-NOSVE-NEXT: mul w9, w9, w13 +; NONEON-NOSVE-NEXT: 
mul w10, w10, w14 +; NONEON-NOSVE-NEXT: mul w11, w11, w12 +; NONEON-NOSVE-NEXT: ubfx w8, w8, #4, #12 +; NONEON-NOSVE-NEXT: ubfx w9, w9, #4, #12 +; NONEON-NOSVE-NEXT: ubfx w10, w10, #4, #12 +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ubfx w8, w11, #4, #12 +; NONEON-NOSVE-NEXT: strh w9, [sp, #28] +; NONEON-NOSVE-NEXT: strh w10, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x i16> undef, i16 4, i64 0 %splat = shufflevector <4 x i16> %insert, <4 x i16> undef, <4 x i32> zeroinitializer @@ -77,8 +96,51 @@ define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: smulh_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smull v0.8h, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: shrn v0.8b, v0.8h, #8 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsb w15, [sp, #15] +; NONEON-NOSVE-NEXT: ldrsb w16, [sp, #23] +; NONEON-NOSVE-NEXT: ldrsb w12, [sp, #12] +; NONEON-NOSVE-NEXT: ldrsb w13, [sp, #13] +; NONEON-NOSVE-NEXT: ldrsb w14, [sp, #14] +; NONEON-NOSVE-NEXT: ldrsb w17, [sp, #22] +; NONEON-NOSVE-NEXT: mul w15, w15, w16 +; NONEON-NOSVE-NEXT: ldrsb w16, [sp, #21] +; NONEON-NOSVE-NEXT: ldrsb w18, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #8] +; NONEON-NOSVE-NEXT: mul w14, w14, w17 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #10] +; NONEON-NOSVE-NEXT: mul w13, w13, w16 +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #11] +; NONEON-NOSVE-NEXT: ldrsb w17, [sp, #16] +; NONEON-NOSVE-NEXT: mul w12, w12, w18 +; NONEON-NOSVE-NEXT: lsr w15, w15, #8 +; NONEON-NOSVE-NEXT: ldrsb w0, [sp, #19] +; NONEON-NOSVE-NEXT: ldrsb w16, [sp, #18] +; NONEON-NOSVE-NEXT: lsr w14, w14, #8 +; NONEON-NOSVE-NEXT: ldrsb w18, [sp, #17] +; NONEON-NOSVE-NEXT: mul w8, w8, w17 +; NONEON-NOSVE-NEXT: lsr w13, w13, #8 +; 
NONEON-NOSVE-NEXT: mul w11, w11, w0 +; NONEON-NOSVE-NEXT: lsr w12, w12, #8 +; NONEON-NOSVE-NEXT: strb w15, [sp, #31] +; NONEON-NOSVE-NEXT: mul w10, w10, w16 +; NONEON-NOSVE-NEXT: strb w14, [sp, #30] +; NONEON-NOSVE-NEXT: mul w9, w9, w18 +; NONEON-NOSVE-NEXT: lsr w8, w8, #8 +; NONEON-NOSVE-NEXT: strb w13, [sp, #29] +; NONEON-NOSVE-NEXT: lsr w11, w11, #8 +; NONEON-NOSVE-NEXT: strb w12, [sp, #28] +; NONEON-NOSVE-NEXT: lsr w10, w10, #8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsr w9, w9, #8 +; NONEON-NOSVE-NEXT: strb w11, [sp, #27] +; NONEON-NOSVE-NEXT: strb w10, [sp, #26] +; NONEON-NOSVE-NEXT: strb w9, [sp, #25] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x i16> undef, i16 8, i64 0 %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer @@ -110,9 +172,116 @@ define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: smulh_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smull2 v2.8h, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: smull v0.8h, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: uzp2 v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #160 +; NONEON-NOSVE-NEXT: str x27, [sp, #80] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #96] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #112] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #128] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #144] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -80 +; 
NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldp d2, d0, [sp] +; NONEON-NOSVE-NEXT: str q1, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #40] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsb w6, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsb w7, [sp, #45] +; NONEON-NOSVE-NEXT: ldrsb w19, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsb w20, [sp, #47] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #49] +; NONEON-NOSVE-NEXT: str d0, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #51] +; NONEON-NOSVE-NEXT: ldrsb w21, [sp, #63] +; NONEON-NOSVE-NEXT: ldrsb w23, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsb w25, [sp, #61] +; NONEON-NOSVE-NEXT: ldrsb w26, [sp, #60] +; NONEON-NOSVE-NEXT: str d1, [sp, #88] +; NONEON-NOSVE-NEXT: ldrsb w12, [sp, #52] +; NONEON-NOSVE-NEXT: mul w20, w20, w21 +; NONEON-NOSVE-NEXT: ldrsb w13, [sp, #53] +; NONEON-NOSVE-NEXT: ldrsb w15, [sp, #54] +; NONEON-NOSVE-NEXT: mul w19, w19, w23 +; NONEON-NOSVE-NEXT: ldrsb w17, [sp, #55] +; NONEON-NOSVE-NEXT: ldrsb w0, [sp, #40] +; NONEON-NOSVE-NEXT: mul w7, w7, w25 +; NONEON-NOSVE-NEXT: ldrsb w2, [sp, #41] +; NONEON-NOSVE-NEXT: ldrsb w3, [sp, #42] +; NONEON-NOSVE-NEXT: mul w6, w6, w26 +; NONEON-NOSVE-NEXT: lsr w20, w20, #8 +; NONEON-NOSVE-NEXT: ldrsb w4, [sp, #43] +; NONEON-NOSVE-NEXT: ldrsb w14, [sp, #88] +; NONEON-NOSVE-NEXT: lsr w19, w19, #8 +; NONEON-NOSVE-NEXT: ldrsb w16, [sp, #89] +; NONEON-NOSVE-NEXT: ldrsb w18, [sp, #90] +; NONEON-NOSVE-NEXT: lsr w7, w7, #8 +; NONEON-NOSVE-NEXT: ldrsb w1, [sp, #91] +; NONEON-NOSVE-NEXT: ldrsb w5, [sp, #92] +; NONEON-NOSVE-NEXT: mul w9, w9, w16 +; NONEON-NOSVE-NEXT: lsr w6, w6, #8 +; NONEON-NOSVE-NEXT: ldrsb w22, [sp, #93] +; NONEON-NOSVE-NEXT: ldrsb w24, [sp, #94] +; NONEON-NOSVE-NEXT: mul w11, w11, w1 +; NONEON-NOSVE-NEXT: ldrsb w21, [sp, #95] +; NONEON-NOSVE-NEXT: ldrsb w23, [sp, #56] +; NONEON-NOSVE-NEXT: mul w12, w12, w5 +; NONEON-NOSVE-NEXT: ldrsb w27, [sp, #59] +; 
NONEON-NOSVE-NEXT: ldrsb w25, [sp, #58] +; NONEON-NOSVE-NEXT: mul w15, w15, w24 +; NONEON-NOSVE-NEXT: ldrsb w26, [sp, #57] +; NONEON-NOSVE-NEXT: mul w0, w0, w23 +; NONEON-NOSVE-NEXT: lsr w11, w11, #8 +; NONEON-NOSVE-NEXT: mul w4, w4, w27 +; NONEON-NOSVE-NEXT: lsr w12, w12, #8 +; NONEON-NOSVE-NEXT: lsr w9, w9, #8 +; NONEON-NOSVE-NEXT: mul w3, w3, w25 +; NONEON-NOSVE-NEXT: lsr w15, w15, #8 +; NONEON-NOSVE-NEXT: strb w20, [sp, #79] +; NONEON-NOSVE-NEXT: mul w2, w2, w26 +; NONEON-NOSVE-NEXT: lsr w0, w0, #8 +; NONEON-NOSVE-NEXT: strb w19, [sp, #78] +; NONEON-NOSVE-NEXT: mul w17, w17, w21 +; NONEON-NOSVE-NEXT: lsr w4, w4, #8 +; NONEON-NOSVE-NEXT: strb w7, [sp, #77] +; NONEON-NOSVE-NEXT: mul w13, w13, w22 +; NONEON-NOSVE-NEXT: lsr w3, w3, #8 +; NONEON-NOSVE-NEXT: strb w6, [sp, #76] +; NONEON-NOSVE-NEXT: mul w10, w10, w18 +; NONEON-NOSVE-NEXT: lsr w2, w2, #8 +; NONEON-NOSVE-NEXT: strb w4, [sp, #75] +; NONEON-NOSVE-NEXT: mul w8, w8, w14 +; NONEON-NOSVE-NEXT: lsr w17, w17, #8 +; NONEON-NOSVE-NEXT: strb w3, [sp, #74] +; NONEON-NOSVE-NEXT: lsr w13, w13, #8 +; NONEON-NOSVE-NEXT: strb w2, [sp, #73] +; NONEON-NOSVE-NEXT: ldr x27, [sp, #80] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: lsr w10, w10, #8 +; NONEON-NOSVE-NEXT: strb w0, [sp, #72] +; NONEON-NOSVE-NEXT: lsr w8, w8, #8 +; NONEON-NOSVE-NEXT: strb w17, [sp, #71] +; NONEON-NOSVE-NEXT: strb w15, [sp, #70] +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #144] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w13, [sp, #69] +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #128] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w12, [sp, #68] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #112] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w11, [sp, #67] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #96] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w10, [sp, #66] +; NONEON-NOSVE-NEXT: strb w9, [sp, #65] +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; 
NONEON-NOSVE-NEXT: ret %1 = sext <16 x i8> %op1 to <16 x i16> %2 = sext <16 x i8> %op2 to <16 x i16> @@ -145,15 +314,251 @@ define void @smulh_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: smulh_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: smull2 v4.8h, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: smull v0.8h, v1.8b, v0.8b -; NONEON-NOSVE-NEXT: smull2 v1.8h, v2.16b, v3.16b -; NONEON-NOSVE-NEXT: smull v2.8h, v2.8b, v3.8b -; NONEON-NOSVE-NEXT: uzp2 v0.16b, v0.16b, v4.16b -; NONEON-NOSVE-NEXT: uzp2 v1.16b, v2.16b, v1.16b -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #384 +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #288] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #304] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #320] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #336] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #352] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #368] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 384 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: mov x29, x0 +; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] +; NONEON-NOSVE-NEXT: str q0, [sp, #128] +; NONEON-NOSVE-NEXT: str q1, [sp, #160] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #128] +; NONEON-NOSVE-NEXT: str q3, [sp, #144] +; NONEON-NOSVE-NEXT: str q2, [sp, #192] +; 
NONEON-NOSVE-NEXT: stp d0, d1, [sp, #176] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #160] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #184] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #185] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #186] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #187] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #224] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #144] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #188] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #189] +; NONEON-NOSVE-NEXT: ldrsb w13, [sp, #229] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #227] +; NONEON-NOSVE-NEXT: ldrsb w12, [sp, #228] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #190] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #191] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #208] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #192] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #96] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #176] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #177] +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #226] +; NONEON-NOSVE-NEXT: ldrsb w2, [sp, #214] +; NONEON-NOSVE-NEXT: ldrsb w1, [sp, #215] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #178] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #179] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #240] +; NONEON-NOSVE-NEXT: ldrsb w4, [sp, #212] +; NONEON-NOSVE-NEXT: ldrsb w3, [sp, #213] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #180] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #181] +; NONEON-NOSVE-NEXT: ldrsb w14, [sp, #247] +; NONEON-NOSVE-NEXT: ldrsb w15, [sp, #246] +; NONEON-NOSVE-NEXT: ldrsb w16, [sp, #244] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #182] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #183] +; NONEON-NOSVE-NEXT: mul w26, w12, w16 +; 
NONEON-NOSVE-NEXT: ldrsb w12, [sp, #242] +; NONEON-NOSVE-NEXT: ldrsb w16, [sp, #250] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #232] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #233] +; NONEON-NOSVE-NEXT: mul w30, w10, w12 +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #255] +; NONEON-NOSVE-NEXT: ldrsb w12, [sp, #253] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #234] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #235] +; NONEON-NOSVE-NEXT: ldrsb w0, [sp, #248] +; NONEON-NOSVE-NEXT: ldrsb w18, [sp, #249] +; NONEON-NOSVE-NEXT: ldrsb w6, [sp, #210] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #236] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #237] +; NONEON-NOSVE-NEXT: ldrsb w5, [sp, #211] +; NONEON-NOSVE-NEXT: ldrsb w19, [sp, #208] +; NONEON-NOSVE-NEXT: ldrsb w7, [sp, #209] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #238] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #239] +; NONEON-NOSVE-NEXT: ldrsb w21, [sp, #222] +; NONEON-NOSVE-NEXT: ldrsb w20, [sp, #223] +; NONEON-NOSVE-NEXT: ldrsb w23, [sp, #220] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #224] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #225] +; NONEON-NOSVE-NEXT: ldrsb w22, [sp, #221] +; NONEON-NOSVE-NEXT: ldrsb w24, [sp, #219] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #230] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #231] +; NONEON-NOSVE-NEXT: mul w27, w8, w14 +; NONEON-NOSVE-NEXT: ldrsb w14, [sp, #245] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #217] +; NONEON-NOSVE-NEXT: mul w9, w9, w15 +; NONEON-NOSVE-NEXT: ldrsb w15, [sp, #251] +; NONEON-NOSVE-NEXT: mul w25, w13, w14 +; NONEON-NOSVE-NEXT: ldrsb w13, [sp, #243] +; NONEON-NOSVE-NEXT: lsr w14, w27, #8 +; NONEON-NOSVE-NEXT: ldrsb w27, [sp, #218] +; 
NONEON-NOSVE-NEXT: lsr w17, w9, #8 +; NONEON-NOSVE-NEXT: mul w28, w11, w13 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #216] +; NONEON-NOSVE-NEXT: strb w14, [sp, #287] +; NONEON-NOSVE-NEXT: lsr w14, w25, #8 +; NONEON-NOSVE-NEXT: ldr w25, [sp, #24] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #241] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #240] +; NONEON-NOSVE-NEXT: strb w14, [sp, #285] +; NONEON-NOSVE-NEXT: lsr w14, w28, #8 +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #254] +; NONEON-NOSVE-NEXT: mul w8, w25, w8 +; NONEON-NOSVE-NEXT: ldr w25, [sp, #28] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldrsb w13, [sp, #252] +; NONEON-NOSVE-NEXT: strb w14, [sp, #283] +; NONEON-NOSVE-NEXT: ldr w14, [sp, #40] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mul w9, w25, w9 +; NONEON-NOSVE-NEXT: ldr w25, [sp, #32] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w17, [sp, #286] +; NONEON-NOSVE-NEXT: mul w12, w14, w12 +; NONEON-NOSVE-NEXT: lsr w8, w8, #8 +; NONEON-NOSVE-NEXT: lsr w17, w26, #8 +; NONEON-NOSVE-NEXT: mul w10, w25, w10 +; NONEON-NOSVE-NEXT: ldr w25, [sp, #36] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldr w14, [sp, #44] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: lsr w9, w9, #8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #281] +; NONEON-NOSVE-NEXT: mul w11, w25, w11 +; NONEON-NOSVE-NEXT: strb w17, [sp, #284] +; NONEON-NOSVE-NEXT: lsr w17, w30, #8 +; NONEON-NOSVE-NEXT: mul w13, w14, w13 +; NONEON-NOSVE-NEXT: lsr w8, w10, #8 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #48] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #280] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #320] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: lsr w9, w11, #8 +; NONEON-NOSVE-NEXT: mul w10, w10, w15 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #52] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #279] +; NONEON-NOSVE-NEXT: lsr w8, w12, #8 +; NONEON-NOSVE-NEXT: ldr w12, [sp, #56] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: 
mul w11, w11, w16 +; NONEON-NOSVE-NEXT: strb w9, [sp, #278] +; NONEON-NOSVE-NEXT: lsr w9, w13, #8 +; NONEON-NOSVE-NEXT: mul w12, w12, w18 +; NONEON-NOSVE-NEXT: ldr w13, [sp, #60] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #277] +; NONEON-NOSVE-NEXT: lsr w8, w10, #8 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #64] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #276] +; NONEON-NOSVE-NEXT: mul w13, w13, w0 +; NONEON-NOSVE-NEXT: lsr w9, w11, #8 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #68] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mul w10, w10, w1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #275] +; NONEON-NOSVE-NEXT: lsr w8, w12, #8 +; NONEON-NOSVE-NEXT: mul w11, w11, w2 +; NONEON-NOSVE-NEXT: ldr w12, [sp, #72] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #274] +; NONEON-NOSVE-NEXT: lsr w9, w13, #8 +; NONEON-NOSVE-NEXT: ldr w13, [sp, #76] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #273] +; NONEON-NOSVE-NEXT: mul w12, w12, w3 +; NONEON-NOSVE-NEXT: lsr w8, w10, #8 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #80] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mul w13, w13, w4 +; NONEON-NOSVE-NEXT: strb w9, [sp, #272] +; NONEON-NOSVE-NEXT: lsr w9, w11, #8 +; NONEON-NOSVE-NEXT: mul w10, w10, w5 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #84] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #271] +; NONEON-NOSVE-NEXT: lsr w8, w12, #8 +; NONEON-NOSVE-NEXT: ldr w12, [sp, #88] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #270] +; NONEON-NOSVE-NEXT: mul w11, w11, w6 +; NONEON-NOSVE-NEXT: lsr w9, w13, #8 +; NONEON-NOSVE-NEXT: ldr w13, [sp, #92] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mul w12, w12, w7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #269] +; NONEON-NOSVE-NEXT: lsr w8, w10, #8 +; NONEON-NOSVE-NEXT: mul w13, w13, w19 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #96] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #268] +; NONEON-NOSVE-NEXT: lsr w9, w11, #8 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #100] // 4-byte Folded 
Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #267] +; NONEON-NOSVE-NEXT: mul w10, w10, w20 +; NONEON-NOSVE-NEXT: lsr w8, w12, #8 +; NONEON-NOSVE-NEXT: ldr w12, [sp, #104] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mul w11, w11, w21 +; NONEON-NOSVE-NEXT: strb w9, [sp, #266] +; NONEON-NOSVE-NEXT: lsr w9, w13, #8 +; NONEON-NOSVE-NEXT: ldr w13, [sp, #108] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mul w12, w12, w22 +; NONEON-NOSVE-NEXT: strb w8, [sp, #265] +; NONEON-NOSVE-NEXT: lsr w8, w10, #8 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #112] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #264] +; NONEON-NOSVE-NEXT: mul w13, w13, w23 +; NONEON-NOSVE-NEXT: lsr w9, w11, #8 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #116] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp w15, w14, [sp, #16] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: mul w10, w10, w24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #263] +; NONEON-NOSVE-NEXT: lsr w8, w12, #8 +; NONEON-NOSVE-NEXT: mul w11, w11, w27 +; NONEON-NOSVE-NEXT: ldr w12, [sp, #120] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #262] +; NONEON-NOSVE-NEXT: lsr w9, w13, #8 +; NONEON-NOSVE-NEXT: ldr w13, [sp, #124] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #261] +; NONEON-NOSVE-NEXT: mul w12, w12, w15 +; NONEON-NOSVE-NEXT: lsr w8, w10, #8 +; NONEON-NOSVE-NEXT: strb w17, [sp, #282] +; NONEON-NOSVE-NEXT: mul w13, w13, w14 +; NONEON-NOSVE-NEXT: strb w9, [sp, #260] +; NONEON-NOSVE-NEXT: lsr w9, w11, #8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #259] +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #368] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: lsr w8, w12, #8 +; NONEON-NOSVE-NEXT: strb w9, [sp, #258] +; NONEON-NOSVE-NEXT: lsr w9, w13, #8 +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #352] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #257] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #336] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #256] +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #304] // 16-byte 
Folded Reload +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #256] +; NONEON-NOSVE-NEXT: stp q0, q1, [x29] +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #288] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: add sp, sp, #384 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -193,12 +598,20 @@ define <2 x i16> @smulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: smulh_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16 -; NONEON-NOSVE-NEXT: shl v1.2s, v1.2s, #16 -; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16 -; NONEON-NOSVE-NEXT: sshr v1.2s, v1.2s, #16 -; NONEON-NOSVE-NEXT: mul v0.2s, v0.2s, v1.2s -; NONEON-NOSVE-NEXT: ushr v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #16] +; NONEON-NOSVE-NEXT: mul w8, w8, w10 +; NONEON-NOSVE-NEXT: mul w9, w9, w11 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: lsr w9, w9, #16 +; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %1 = sext <2 x i16> %op1 to <2 x i32> %2 = sext <2 x i16> %op2 to <2 x i32> @@ -228,8 +641,31 @@ define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: smulh_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smull v0.4s, v0.4h, v1.4h -; NONEON-NOSVE-NEXT: shrn v0.4h, v0.4s, #16 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #14] +; NONEON-NOSVE-NEXT: ldrsh w12, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #12] +; NONEON-NOSVE-NEXT: ldrsh w13, [sp, 
#20] +; NONEON-NOSVE-NEXT: ldrsh w14, [sp, #18] +; NONEON-NOSVE-NEXT: mul w11, w11, w12 +; NONEON-NOSVE-NEXT: ldrsh w12, [sp, #16] +; NONEON-NOSVE-NEXT: mul w10, w10, w13 +; NONEON-NOSVE-NEXT: mul w9, w9, w14 +; NONEON-NOSVE-NEXT: mul w8, w8, w12 +; NONEON-NOSVE-NEXT: lsr w11, w11, #16 +; NONEON-NOSVE-NEXT: lsr w10, w10, #16 +; NONEON-NOSVE-NEXT: lsr w9, w9, #16 +; NONEON-NOSVE-NEXT: strh w11, [sp, #30] +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w10, [sp, #28] +; NONEON-NOSVE-NEXT: strh w9, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %1 = sext <4 x i16> %op1 to <4 x i32> %2 = sext <4 x i16> %op2 to <4 x i32> @@ -259,9 +695,54 @@ define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: smulh_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smull2 v2.4s, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: smull v0.4s, v0.4h, v1.4h -; NONEON-NOSVE-NEXT: uzp2 v0.8h, v0.8h, v2.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-80]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsh w15, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsh w12, [sp, #32] +; NONEON-NOSVE-NEXT: ldrsh w13, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsh w14, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #42] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsh w16, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsh w17, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsh w18, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsh w0, [sp, #62] +; NONEON-NOSVE-NEXT: mul w15, w15, w16 +; NONEON-NOSVE-NEXT: ldrsh w16, [sp, #48] +; NONEON-NOSVE-NEXT: mul w14, w14, w17 +; NONEON-NOSVE-NEXT: ldrsh w17, [sp, #56] +; NONEON-NOSVE-NEXT: mul w13, w13, w18 +; NONEON-NOSVE-NEXT: ldrsh w18, [sp, #60] +; NONEON-NOSVE-NEXT: mul w12, w12, w16 +; NONEON-NOSVE-NEXT: ldrsh w16, [sp, #58] +; NONEON-NOSVE-NEXT: lsr w15, w15, #16 +; NONEON-NOSVE-NEXT: mul w11, w11, w0 +; NONEON-NOSVE-NEXT: lsr w14, w14, #16 +; NONEON-NOSVE-NEXT: mul w10, w10, w18 +; NONEON-NOSVE-NEXT: lsr w13, w13, #16 +; NONEON-NOSVE-NEXT: strh w15, [sp, #78] +; NONEON-NOSVE-NEXT: mul w9, w9, w16 +; NONEON-NOSVE-NEXT: lsr w12, w12, #16 +; NONEON-NOSVE-NEXT: strh w14, [sp, #76] +; NONEON-NOSVE-NEXT: mul w8, w8, w17 +; NONEON-NOSVE-NEXT: lsr w11, w11, #16 +; NONEON-NOSVE-NEXT: strh w13, [sp, #74] +; NONEON-NOSVE-NEXT: lsr w10, w10, #16 +; NONEON-NOSVE-NEXT: strh w12, [sp, #72] +; NONEON-NOSVE-NEXT: lsr w9, w9, #16 +; NONEON-NOSVE-NEXT: strh w11, [sp, #70] +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w10, [sp, #68] +; NONEON-NOSVE-NEXT: strh w9, [sp, #66] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %1 = sext <8 x i16> %op1 to <8 
x i32> %2 = sext <8 x i16> %op2 to <8 x i32> @@ -294,15 +775,125 @@ define void @smulh_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: smulh_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: smull2 v4.4s, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: smull v0.4s, v1.4h, v0.4h -; NONEON-NOSVE-NEXT: smull2 v1.4s, v2.8h, v3.8h -; NONEON-NOSVE-NEXT: smull v2.4s, v2.4h, v3.4h -; NONEON-NOSVE-NEXT: uzp2 v0.8h, v0.8h, v4.8h -; NONEON-NOSVE-NEXT: uzp2 v1.8h, v2.8h, v1.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #240 +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #160] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #176] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #192] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #208] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #224] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 240 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: str q3, [sp, #16] +; NONEON-NOSVE-NEXT: str q2, [sp, #64] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsh w12, [sp, #48] +; 
NONEON-NOSVE-NEXT: ldrsh w13, [sp, #50] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsh w7, [sp, #96] +; NONEON-NOSVE-NEXT: ldrsh w19, [sp, #98] +; NONEON-NOSVE-NEXT: ldrsh w20, [sp, #100] +; NONEON-NOSVE-NEXT: ldrsh w21, [sp, #102] +; NONEON-NOSVE-NEXT: ldrsh w14, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsh w16, [sp, #54] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: ldrsh w18, [sp, #104] +; NONEON-NOSVE-NEXT: ldrsh w2, [sp, #106] +; NONEON-NOSVE-NEXT: ldrsh w4, [sp, #108] +; NONEON-NOSVE-NEXT: ldrsh w5, [sp, #110] +; NONEON-NOSVE-NEXT: ldrsh w15, [sp, #88] +; NONEON-NOSVE-NEXT: ldrsh w17, [sp, #90] +; NONEON-NOSVE-NEXT: ldrsh w1, [sp, #92] +; NONEON-NOSVE-NEXT: ldrsh w3, [sp, #94] +; NONEON-NOSVE-NEXT: mul w8, w8, w15 +; NONEON-NOSVE-NEXT: ldrsh w6, [sp, #80] +; NONEON-NOSVE-NEXT: ldrsh w23, [sp, #82] +; NONEON-NOSVE-NEXT: mul w11, w11, w3 +; NONEON-NOSVE-NEXT: ldrsh w25, [sp, #84] +; NONEON-NOSVE-NEXT: mul w13, w13, w23 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #64] +; NONEON-NOSVE-NEXT: mul w14, w14, w25 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: mul w12, w12, w6 +; NONEON-NOSVE-NEXT: lsr w11, w11, #16 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112] +; NONEON-NOSVE-NEXT: mul w10, w10, w1 +; NONEON-NOSVE-NEXT: lsr w13, w13, #16 +; NONEON-NOSVE-NEXT: ldrsh w22, [sp, #118] +; NONEON-NOSVE-NEXT: ldrsh w24, [sp, #116] +; NONEON-NOSVE-NEXT: ldrsh w26, [sp, #114] +; NONEON-NOSVE-NEXT: ldrsh w27, [sp, #112] +; NONEON-NOSVE-NEXT: ldrsh w28, [sp, #126] +; NONEON-NOSVE-NEXT: mul w9, w9, w17 +; NONEON-NOSVE-NEXT: mul w21, w21, w22 +; NONEON-NOSVE-NEXT: ldrsh w22, [sp, #86] +; NONEON-NOSVE-NEXT: lsr w14, w14, #16 +; NONEON-NOSVE-NEXT: mul w20, w20, w24 +; NONEON-NOSVE-NEXT: ldrsh w24, [sp, #120] +; NONEON-NOSVE-NEXT: lsr w12, w12, #16 +; NONEON-NOSVE-NEXT: mul w19, w19, w26 +; NONEON-NOSVE-NEXT: ldrsh w26, [sp, #124] +; NONEON-NOSVE-NEXT: lsr w10, w10, #16 +; NONEON-NOSVE-NEXT: mul 
w7, w7, w27 +; NONEON-NOSVE-NEXT: ldrsh w27, [sp, #122] +; NONEON-NOSVE-NEXT: lsr w21, w21, #16 +; NONEON-NOSVE-NEXT: mul w5, w5, w28 +; NONEON-NOSVE-NEXT: lsr w20, w20, #16 +; NONEON-NOSVE-NEXT: lsr w9, w9, #16 +; NONEON-NOSVE-NEXT: mul w4, w4, w26 +; NONEON-NOSVE-NEXT: lsr w19, w19, #16 +; NONEON-NOSVE-NEXT: strh w21, [sp, #158] +; NONEON-NOSVE-NEXT: mul w2, w2, w27 +; NONEON-NOSVE-NEXT: lsr w7, w7, #16 +; NONEON-NOSVE-NEXT: strh w20, [sp, #156] +; NONEON-NOSVE-NEXT: mul w18, w18, w24 +; NONEON-NOSVE-NEXT: lsr w5, w5, #16 +; NONEON-NOSVE-NEXT: strh w19, [sp, #154] +; NONEON-NOSVE-NEXT: mul w16, w16, w22 +; NONEON-NOSVE-NEXT: lsr w4, w4, #16 +; NONEON-NOSVE-NEXT: strh w7, [sp, #152] +; NONEON-NOSVE-NEXT: lsr w2, w2, #16 +; NONEON-NOSVE-NEXT: strh w5, [sp, #150] +; NONEON-NOSVE-NEXT: lsr w18, w18, #16 +; NONEON-NOSVE-NEXT: strh w4, [sp, #148] +; NONEON-NOSVE-NEXT: lsr w16, w16, #16 +; NONEON-NOSVE-NEXT: strh w2, [sp, #146] +; NONEON-NOSVE-NEXT: strh w18, [sp, #144] +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #224] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w16, [sp, #142] +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #208] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w14, [sp, #140] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #192] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w13, [sp, #138] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #176] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w12, [sp, #136] +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #160] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w11, [sp, #134] +; NONEON-NOSVE-NEXT: strh w10, [sp, #132] +; NONEON-NOSVE-NEXT: strh w9, [sp, #130] +; NONEON-NOSVE-NEXT: strh w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #128] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #240 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -335,8 +926,18 @@ define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: 
smulh_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smull v0.2d, v0.2s, v1.2s -; NONEON-NOSVE-NEXT: shrn v0.2s, v0.2d, #32 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #8] +; NONEON-NOSVE-NEXT: ldpsw x11, x10, [sp, #16] +; NONEON-NOSVE-NEXT: smull x9, w9, w10 +; NONEON-NOSVE-NEXT: smull x8, w8, w11 +; NONEON-NOSVE-NEXT: lsr x9, x9, #32 +; NONEON-NOSVE-NEXT: lsr x8, x8, #32 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %1 = sext <2 x i32> %op1 to <2 x i64> %2 = sext <2 x i32> %op2 to <2 x i64> @@ -366,9 +967,28 @@ define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: smulh_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smull2 v2.2d, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: smull v0.2d, v0.2s, v1.2s -; NONEON-NOSVE-NEXT: uzp2 v0.4s, v0.4s, v2.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-80]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #40] +; NONEON-NOSVE-NEXT: ldpsw x10, x11, [sp, #32] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldpsw x13, x12, [sp, #48] +; NONEON-NOSVE-NEXT: smull x11, w11, w12 +; NONEON-NOSVE-NEXT: ldpsw x12, x14, [sp, #56] +; NONEON-NOSVE-NEXT: smull x10, w10, w13 +; NONEON-NOSVE-NEXT: lsr x11, x11, #32 +; NONEON-NOSVE-NEXT: smull x9, w9, w14 +; NONEON-NOSVE-NEXT: smull x8, w8, w12 +; NONEON-NOSVE-NEXT: lsr x10, x10, #32 +; NONEON-NOSVE-NEXT: lsr x9, x9, #32 +; NONEON-NOSVE-NEXT: stp w10, w11, [sp, #72] +; NONEON-NOSVE-NEXT: lsr x8, x8, #32 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %1 = sext <4 x i32> %op1 to <4 x i64> %2 = sext <4 x i32> %op2 to <4 x i64> @@ -401,15 +1021,52 @@ define void @smulh_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: smulh_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: smull2 v4.2d, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: smull v0.2d, v1.2s, v0.2s -; NONEON-NOSVE-NEXT: smull2 v1.2d, v2.4s, v3.4s -; NONEON-NOSVE-NEXT: smull v2.2d, v2.2s, v3.2s -; NONEON-NOSVE-NEXT: uzp2 v0.4s, v0.4s, v4.4s -; NONEON-NOSVE-NEXT: uzp2 v1.4s, v2.4s, v1.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #160 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #32] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #56] +; 
NONEON-NOSVE-NEXT: ldpsw x10, x11, [sp, #48] +; NONEON-NOSVE-NEXT: ldpsw x12, x13, [sp, #104] +; NONEON-NOSVE-NEXT: ldpsw x14, x15, [sp, #96] +; NONEON-NOSVE-NEXT: str q2, [sp, #64] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #64] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112] +; NONEON-NOSVE-NEXT: ldpsw x17, x16, [sp, #112] +; NONEON-NOSVE-NEXT: smull x15, w15, w16 +; NONEON-NOSVE-NEXT: ldpsw x16, x18, [sp, #120] +; NONEON-NOSVE-NEXT: smull x14, w14, w17 +; NONEON-NOSVE-NEXT: ldpsw x17, x1, [sp, #80] +; NONEON-NOSVE-NEXT: smull x13, w13, w18 +; NONEON-NOSVE-NEXT: lsr x15, x15, #32 +; NONEON-NOSVE-NEXT: smull x12, w12, w16 +; NONEON-NOSVE-NEXT: lsr x14, x14, #32 +; NONEON-NOSVE-NEXT: ldpsw x16, x18, [sp, #88] +; NONEON-NOSVE-NEXT: smull x11, w11, w1 +; NONEON-NOSVE-NEXT: lsr x13, x13, #32 +; NONEON-NOSVE-NEXT: stp w14, w15, [sp, #152] +; NONEON-NOSVE-NEXT: smull x10, w10, w17 +; NONEON-NOSVE-NEXT: lsr x12, x12, #32 +; NONEON-NOSVE-NEXT: smull x9, w9, w18 +; NONEON-NOSVE-NEXT: smull x8, w8, w16 +; NONEON-NOSVE-NEXT: lsr x11, x11, #32 +; NONEON-NOSVE-NEXT: stp w12, w13, [sp, #144] +; NONEON-NOSVE-NEXT: lsr x10, x10, #32 +; NONEON-NOSVE-NEXT: lsr x9, x9, #32 +; NONEON-NOSVE-NEXT: lsr x8, x8, #32 +; NONEON-NOSVE-NEXT: stp w10, w11, [sp, #136] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #128] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -442,12 +1099,14 @@ define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: smulh_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fmov x8, d0 ; NONEON-NOSVE-NEXT: fmov x9, d1 ; NONEON-NOSVE-NEXT: smulh x8, x8, 
x9 -; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <1 x i128> undef, i128 64, i128 0 %splat = shufflevector <1 x i128> %insert, <1 x i128> undef, <1 x i32> zeroinitializer @@ -479,15 +1138,17 @@ define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: smulh_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov x8, v0.d[1] -; NONEON-NOSVE-NEXT: mov x9, v1.d[1] -; NONEON-NOSVE-NEXT: fmov x10, d0 -; NONEON-NOSVE-NEXT: fmov x11, d1 -; NONEON-NOSVE-NEXT: smulh x10, x10, x11 -; NONEON-NOSVE-NEXT: smulh x8, x8, x9 -; NONEON-NOSVE-NEXT: fmov d0, x10 -; NONEON-NOSVE-NEXT: fmov d1, x8 -; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp] +; NONEON-NOSVE-NEXT: ldp x11, x10, [sp, #16] +; NONEON-NOSVE-NEXT: smulh x8, x8, x10 +; NONEON-NOSVE-NEXT: smulh x9, x9, x11 +; NONEON-NOSVE-NEXT: stp x9, x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %1 = sext <2 x i64> %op1 to <2 x i128> %2 = sext <2 x i64> %op2 to <2 x i128> @@ -520,27 +1181,29 @@ define void @smulh_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: smulh_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fmov x9, d0 -; NONEON-NOSVE-NEXT: mov x11, v0.d[1] -; NONEON-NOSVE-NEXT: mov x14, v3.d[1] -; NONEON-NOSVE-NEXT: fmov x8, d1 -; NONEON-NOSVE-NEXT: mov x10, v1.d[1] -; NONEON-NOSVE-NEXT: mov x13, v2.d[1] -; NONEON-NOSVE-NEXT: fmov x12, d3 -; NONEON-NOSVE-NEXT: smulh x8, x8, x9 -; NONEON-NOSVE-NEXT: fmov x9, d2 -; NONEON-NOSVE-NEXT: smulh x10, x10, x11 -; NONEON-NOSVE-NEXT: smulh x9, x9, 
x12 -; NONEON-NOSVE-NEXT: fmov d0, x8 -; NONEON-NOSVE-NEXT: smulh x11, x13, x14 -; NONEON-NOSVE-NEXT: fmov d1, x10 -; NONEON-NOSVE-NEXT: fmov d2, x9 -; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] -; NONEON-NOSVE-NEXT: fmov d3, x11 -; NONEON-NOSVE-NEXT: mov v2.d[1], v3.d[0] -; NONEON-NOSVE-NEXT: stp q0, q2, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #128 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [sp] +; NONEON-NOSVE-NEXT: ldp x11, x10, [sp] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x13, x12, [sp, #16] +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp, #32] +; NONEON-NOSVE-NEXT: smulh x10, x10, x12 +; NONEON-NOSVE-NEXT: ldp x14, x12, [sp, #48] +; NONEON-NOSVE-NEXT: smulh x11, x11, x13 +; NONEON-NOSVE-NEXT: smulh x8, x8, x12 +; NONEON-NOSVE-NEXT: smulh x9, x9, x14 +; NONEON-NOSVE-NEXT: stp x11, x10, [sp, #64] +; NONEON-NOSVE-NEXT: stp x9, x8, [sp, #80] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -583,11 +1246,31 @@ define <4 x i8> @umulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: umulh_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d2, #0xff00ff00ff00ff -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v2.8b -; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: mul v0.4h, v0.4h, v1.4h -; NONEON-NOSVE-NEXT: ushr v0.4h, v0.4h, #4 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; 
NONEON-NOSVE-NEXT: ldrb w10, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #18] +; NONEON-NOSVE-NEXT: mul w8, w8, w12 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #16] +; NONEON-NOSVE-NEXT: mul w9, w9, w13 +; NONEON-NOSVE-NEXT: mul w10, w10, w14 +; NONEON-NOSVE-NEXT: mul w11, w11, w12 +; NONEON-NOSVE-NEXT: lsr w8, w8, #4 +; NONEON-NOSVE-NEXT: lsr w9, w9, #4 +; NONEON-NOSVE-NEXT: lsr w10, w10, #4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: lsr w8, w11, #4 +; NONEON-NOSVE-NEXT: strh w9, [sp, #28] +; NONEON-NOSVE-NEXT: strh w10, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %1 = zext <4 x i8> %op1 to <4 x i16> %2 = zext <4 x i8> %op2 to <4 x i16> @@ -617,8 +1300,51 @@ define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: umulh_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umull v0.8h, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: shrn v0.8b, v0.8h, #8 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #15] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #22] +; NONEON-NOSVE-NEXT: mul w15, w15, w16 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w18, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: mul w14, w14, w17 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #10] +; NONEON-NOSVE-NEXT: mul w13, w13, w16 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #16] +; NONEON-NOSVE-NEXT: mul w12, w12, w18 +; NONEON-NOSVE-NEXT: lsr w15, w15, #8 +; NONEON-NOSVE-NEXT: ldrb w0, [sp, #19] 
+; NONEON-NOSVE-NEXT: ldrb w16, [sp, #18] +; NONEON-NOSVE-NEXT: lsr w14, w14, #8 +; NONEON-NOSVE-NEXT: ldrb w18, [sp, #17] +; NONEON-NOSVE-NEXT: mul w8, w8, w17 +; NONEON-NOSVE-NEXT: lsr w13, w13, #8 +; NONEON-NOSVE-NEXT: mul w11, w11, w0 +; NONEON-NOSVE-NEXT: lsr w12, w12, #8 +; NONEON-NOSVE-NEXT: strb w15, [sp, #31] +; NONEON-NOSVE-NEXT: mul w10, w10, w16 +; NONEON-NOSVE-NEXT: strb w14, [sp, #30] +; NONEON-NOSVE-NEXT: mul w9, w9, w18 +; NONEON-NOSVE-NEXT: lsr w8, w8, #8 +; NONEON-NOSVE-NEXT: strb w13, [sp, #29] +; NONEON-NOSVE-NEXT: lsr w11, w11, #8 +; NONEON-NOSVE-NEXT: strb w12, [sp, #28] +; NONEON-NOSVE-NEXT: lsr w10, w10, #8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsr w9, w9, #8 +; NONEON-NOSVE-NEXT: strb w11, [sp, #27] +; NONEON-NOSVE-NEXT: strb w10, [sp, #26] +; NONEON-NOSVE-NEXT: strb w9, [sp, #25] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %1 = zext <8 x i8> %op1 to <8 x i16> %2 = zext <8 x i8> %op2 to <8 x i16> @@ -648,9 +1374,116 @@ define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: umulh_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umull2 v2.8h, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: umull v0.8h, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: uzp2 v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #160 +; NONEON-NOSVE-NEXT: str x27, [sp, #80] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #96] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #112] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #128] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #144] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset 
w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -80 +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldp d2, d0, [sp] +; NONEON-NOSVE-NEXT: str q1, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #40] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w6, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w7, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w19, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w20, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #49] +; NONEON-NOSVE-NEXT: str d0, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w21, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w23, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w25, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w26, [sp, #60] +; NONEON-NOSVE-NEXT: str d1, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #52] +; NONEON-NOSVE-NEXT: mul w20, w20, w21 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #54] +; NONEON-NOSVE-NEXT: mul w19, w19, w23 +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w0, [sp, #40] +; NONEON-NOSVE-NEXT: mul w7, w7, w25 +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w3, [sp, #42] +; NONEON-NOSVE-NEXT: mul w6, w6, w26 +; NONEON-NOSVE-NEXT: lsr w20, w20, #8 +; NONEON-NOSVE-NEXT: ldrb w4, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #88] +; NONEON-NOSVE-NEXT: lsr w19, w19, #8 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w18, [sp, #90] +; NONEON-NOSVE-NEXT: lsr w7, w7, #8 +; NONEON-NOSVE-NEXT: ldrb w1, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w5, [sp, #92] +; NONEON-NOSVE-NEXT: mul w9, w9, w16 +; NONEON-NOSVE-NEXT: lsr w6, w6, #8 +; NONEON-NOSVE-NEXT: ldrb w22, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w24, [sp, #94] +; NONEON-NOSVE-NEXT: mul w11, w11, w1 +; NONEON-NOSVE-NEXT: ldrb w21, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb 
w23, [sp, #56] +; NONEON-NOSVE-NEXT: mul w12, w12, w5 +; NONEON-NOSVE-NEXT: ldrb w27, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w25, [sp, #58] +; NONEON-NOSVE-NEXT: mul w15, w15, w24 +; NONEON-NOSVE-NEXT: ldrb w26, [sp, #57] +; NONEON-NOSVE-NEXT: mul w0, w0, w23 +; NONEON-NOSVE-NEXT: lsr w11, w11, #8 +; NONEON-NOSVE-NEXT: mul w4, w4, w27 +; NONEON-NOSVE-NEXT: lsr w12, w12, #8 +; NONEON-NOSVE-NEXT: lsr w9, w9, #8 +; NONEON-NOSVE-NEXT: mul w3, w3, w25 +; NONEON-NOSVE-NEXT: lsr w15, w15, #8 +; NONEON-NOSVE-NEXT: strb w20, [sp, #79] +; NONEON-NOSVE-NEXT: mul w2, w2, w26 +; NONEON-NOSVE-NEXT: lsr w0, w0, #8 +; NONEON-NOSVE-NEXT: strb w19, [sp, #78] +; NONEON-NOSVE-NEXT: mul w17, w17, w21 +; NONEON-NOSVE-NEXT: lsr w4, w4, #8 +; NONEON-NOSVE-NEXT: strb w7, [sp, #77] +; NONEON-NOSVE-NEXT: mul w13, w13, w22 +; NONEON-NOSVE-NEXT: lsr w3, w3, #8 +; NONEON-NOSVE-NEXT: strb w6, [sp, #76] +; NONEON-NOSVE-NEXT: mul w10, w10, w18 +; NONEON-NOSVE-NEXT: lsr w2, w2, #8 +; NONEON-NOSVE-NEXT: strb w4, [sp, #75] +; NONEON-NOSVE-NEXT: mul w8, w8, w14 +; NONEON-NOSVE-NEXT: lsr w17, w17, #8 +; NONEON-NOSVE-NEXT: strb w3, [sp, #74] +; NONEON-NOSVE-NEXT: lsr w13, w13, #8 +; NONEON-NOSVE-NEXT: strb w2, [sp, #73] +; NONEON-NOSVE-NEXT: ldr x27, [sp, #80] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: lsr w10, w10, #8 +; NONEON-NOSVE-NEXT: strb w0, [sp, #72] +; NONEON-NOSVE-NEXT: lsr w8, w8, #8 +; NONEON-NOSVE-NEXT: strb w17, [sp, #71] +; NONEON-NOSVE-NEXT: strb w15, [sp, #70] +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #144] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w13, [sp, #69] +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #128] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w12, [sp, #68] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #112] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w11, [sp, #67] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #96] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w10, [sp, #66] +; NONEON-NOSVE-NEXT: strb w9, [sp, #65] +; NONEON-NOSVE-NEXT: strb w8, [sp, 
#64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %1 = zext <16 x i8> %op1 to <16 x i16> %2 = zext <16 x i8> %op2 to <16 x i16> @@ -683,15 +1516,251 @@ define void @umulh_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: umulh_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: umull2 v4.8h, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: umull v0.8h, v1.8b, v0.8b -; NONEON-NOSVE-NEXT: umull2 v1.8h, v2.16b, v3.16b -; NONEON-NOSVE-NEXT: umull v2.8h, v2.8b, v3.8b -; NONEON-NOSVE-NEXT: uzp2 v0.16b, v0.16b, v4.16b -; NONEON-NOSVE-NEXT: uzp2 v1.16b, v2.16b, v1.16b -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #384 +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #288] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #304] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #320] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #336] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #352] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #368] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 384 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: mov x29, x0 +; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] +; NONEON-NOSVE-NEXT: str q0, [sp, #128] +; NONEON-NOSVE-NEXT: str q1, [sp, #160] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #128] +; 
NONEON-NOSVE-NEXT: str q3, [sp, #144] +; NONEON-NOSVE-NEXT: str q2, [sp, #192] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #176] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #160] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #184] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #185] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #186] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #187] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #224] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #144] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #188] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #189] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #229] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #227] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #228] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #190] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #191] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #208] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #192] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #96] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #176] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #177] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #226] +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #214] +; NONEON-NOSVE-NEXT: ldrb w1, [sp, #215] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #178] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #179] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #240] +; NONEON-NOSVE-NEXT: ldrb w4, [sp, #212] +; NONEON-NOSVE-NEXT: ldrb w3, [sp, #213] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #180] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #181] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #247] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #246] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #244] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #182] +; NONEON-NOSVE-NEXT: ldrb w8, 
[sp, #183] +; NONEON-NOSVE-NEXT: mul w26, w12, w16 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #242] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #250] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #232] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #233] +; NONEON-NOSVE-NEXT: mul w30, w10, w12 +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #255] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #253] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #234] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #235] +; NONEON-NOSVE-NEXT: ldrb w0, [sp, #248] +; NONEON-NOSVE-NEXT: ldrb w18, [sp, #249] +; NONEON-NOSVE-NEXT: ldrb w6, [sp, #210] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #236] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #237] +; NONEON-NOSVE-NEXT: ldrb w5, [sp, #211] +; NONEON-NOSVE-NEXT: ldrb w19, [sp, #208] +; NONEON-NOSVE-NEXT: ldrb w7, [sp, #209] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #238] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #239] +; NONEON-NOSVE-NEXT: ldrb w21, [sp, #222] +; NONEON-NOSVE-NEXT: ldrb w20, [sp, #223] +; NONEON-NOSVE-NEXT: ldrb w23, [sp, #220] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #224] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #225] +; NONEON-NOSVE-NEXT: ldrb w22, [sp, #221] +; NONEON-NOSVE-NEXT: ldrb w24, [sp, #219] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #230] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #231] +; NONEON-NOSVE-NEXT: mul w27, w8, w14 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #245] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #217] +; NONEON-NOSVE-NEXT: mul w9, w9, w15 +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #251] +; NONEON-NOSVE-NEXT: mul w25, w13, w14 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #243] +; NONEON-NOSVE-NEXT: lsr w14, w27, #8 +; NONEON-NOSVE-NEXT: ldrb w27, 
[sp, #218] +; NONEON-NOSVE-NEXT: lsr w17, w9, #8 +; NONEON-NOSVE-NEXT: mul w28, w11, w13 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #216] +; NONEON-NOSVE-NEXT: strb w14, [sp, #287] +; NONEON-NOSVE-NEXT: lsr w14, w25, #8 +; NONEON-NOSVE-NEXT: ldr w25, [sp, #24] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #241] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #240] +; NONEON-NOSVE-NEXT: strb w14, [sp, #285] +; NONEON-NOSVE-NEXT: lsr w14, w28, #8 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #254] +; NONEON-NOSVE-NEXT: mul w8, w25, w8 +; NONEON-NOSVE-NEXT: ldr w25, [sp, #28] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #252] +; NONEON-NOSVE-NEXT: strb w14, [sp, #283] +; NONEON-NOSVE-NEXT: ldr w14, [sp, #40] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mul w9, w25, w9 +; NONEON-NOSVE-NEXT: ldr w25, [sp, #32] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w17, [sp, #286] +; NONEON-NOSVE-NEXT: mul w12, w14, w12 +; NONEON-NOSVE-NEXT: lsr w8, w8, #8 +; NONEON-NOSVE-NEXT: lsr w17, w26, #8 +; NONEON-NOSVE-NEXT: mul w10, w25, w10 +; NONEON-NOSVE-NEXT: ldr w25, [sp, #36] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldr w14, [sp, #44] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: lsr w9, w9, #8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #281] +; NONEON-NOSVE-NEXT: mul w11, w25, w11 +; NONEON-NOSVE-NEXT: strb w17, [sp, #284] +; NONEON-NOSVE-NEXT: lsr w17, w30, #8 +; NONEON-NOSVE-NEXT: mul w13, w14, w13 +; NONEON-NOSVE-NEXT: lsr w8, w10, #8 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #48] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #280] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #320] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: lsr w9, w11, #8 +; NONEON-NOSVE-NEXT: mul w10, w10, w15 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #52] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #279] +; NONEON-NOSVE-NEXT: lsr w8, w12, #8 +; NONEON-NOSVE-NEXT: ldr w12, [sp, #56] // 4-byte Folded Reload +; 
NONEON-NOSVE-NEXT: mul w11, w11, w16 +; NONEON-NOSVE-NEXT: strb w9, [sp, #278] +; NONEON-NOSVE-NEXT: lsr w9, w13, #8 +; NONEON-NOSVE-NEXT: mul w12, w12, w18 +; NONEON-NOSVE-NEXT: ldr w13, [sp, #60] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #277] +; NONEON-NOSVE-NEXT: lsr w8, w10, #8 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #64] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #276] +; NONEON-NOSVE-NEXT: mul w13, w13, w0 +; NONEON-NOSVE-NEXT: lsr w9, w11, #8 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #68] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mul w10, w10, w1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #275] +; NONEON-NOSVE-NEXT: lsr w8, w12, #8 +; NONEON-NOSVE-NEXT: mul w11, w11, w2 +; NONEON-NOSVE-NEXT: ldr w12, [sp, #72] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #274] +; NONEON-NOSVE-NEXT: lsr w9, w13, #8 +; NONEON-NOSVE-NEXT: ldr w13, [sp, #76] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #273] +; NONEON-NOSVE-NEXT: mul w12, w12, w3 +; NONEON-NOSVE-NEXT: lsr w8, w10, #8 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #80] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mul w13, w13, w4 +; NONEON-NOSVE-NEXT: strb w9, [sp, #272] +; NONEON-NOSVE-NEXT: lsr w9, w11, #8 +; NONEON-NOSVE-NEXT: mul w10, w10, w5 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #84] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #271] +; NONEON-NOSVE-NEXT: lsr w8, w12, #8 +; NONEON-NOSVE-NEXT: ldr w12, [sp, #88] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #270] +; NONEON-NOSVE-NEXT: mul w11, w11, w6 +; NONEON-NOSVE-NEXT: lsr w9, w13, #8 +; NONEON-NOSVE-NEXT: ldr w13, [sp, #92] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mul w12, w12, w7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #269] +; NONEON-NOSVE-NEXT: lsr w8, w10, #8 +; NONEON-NOSVE-NEXT: mul w13, w13, w19 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #96] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #268] +; NONEON-NOSVE-NEXT: lsr w9, w11, #8 +; NONEON-NOSVE-NEXT: ldr w11, [sp, 
#100] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #267] +; NONEON-NOSVE-NEXT: mul w10, w10, w20 +; NONEON-NOSVE-NEXT: lsr w8, w12, #8 +; NONEON-NOSVE-NEXT: ldr w12, [sp, #104] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mul w11, w11, w21 +; NONEON-NOSVE-NEXT: strb w9, [sp, #266] +; NONEON-NOSVE-NEXT: lsr w9, w13, #8 +; NONEON-NOSVE-NEXT: ldr w13, [sp, #108] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mul w12, w12, w22 +; NONEON-NOSVE-NEXT: strb w8, [sp, #265] +; NONEON-NOSVE-NEXT: lsr w8, w10, #8 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #112] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #264] +; NONEON-NOSVE-NEXT: mul w13, w13, w23 +; NONEON-NOSVE-NEXT: lsr w9, w11, #8 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #116] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp w15, w14, [sp, #16] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: mul w10, w10, w24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #263] +; NONEON-NOSVE-NEXT: lsr w8, w12, #8 +; NONEON-NOSVE-NEXT: mul w11, w11, w27 +; NONEON-NOSVE-NEXT: ldr w12, [sp, #120] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #262] +; NONEON-NOSVE-NEXT: lsr w9, w13, #8 +; NONEON-NOSVE-NEXT: ldr w13, [sp, #124] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #261] +; NONEON-NOSVE-NEXT: mul w12, w12, w15 +; NONEON-NOSVE-NEXT: lsr w8, w10, #8 +; NONEON-NOSVE-NEXT: strb w17, [sp, #282] +; NONEON-NOSVE-NEXT: mul w13, w13, w14 +; NONEON-NOSVE-NEXT: strb w9, [sp, #260] +; NONEON-NOSVE-NEXT: lsr w9, w11, #8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #259] +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #368] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: lsr w8, w12, #8 +; NONEON-NOSVE-NEXT: strb w9, [sp, #258] +; NONEON-NOSVE-NEXT: lsr w9, w13, #8 +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #352] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #257] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #336] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #256] +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, 
#304] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #256] +; NONEON-NOSVE-NEXT: stp q0, q1, [x29] +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #288] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: add sp, sp, #384 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -730,11 +1799,20 @@ define <2 x i16> @umulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: umulh_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d2, #0x00ffff0000ffff -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v2.8b -; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: mul v0.2s, v0.2s, v1.2s -; NONEON-NOSVE-NEXT: ushr v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #16] +; NONEON-NOSVE-NEXT: mul w8, w8, w10 +; NONEON-NOSVE-NEXT: mul w9, w9, w11 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: lsr w9, w9, #16 +; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %1 = zext <2 x i16> %op1 to <2 x i32> %2 = zext <2 x i16> %op2 to <2 x i32> @@ -764,8 +1842,31 @@ define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: umulh_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umull v0.4s, v0.4h, v1.4h -; NONEON-NOSVE-NEXT: shrn v0.4h, v0.4s, #16 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #20] +; NONEON-NOSVE-NEXT: 
ldrh w14, [sp, #18] +; NONEON-NOSVE-NEXT: mul w11, w11, w12 +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #16] +; NONEON-NOSVE-NEXT: mul w10, w10, w13 +; NONEON-NOSVE-NEXT: mul w9, w9, w14 +; NONEON-NOSVE-NEXT: mul w8, w8, w12 +; NONEON-NOSVE-NEXT: lsr w11, w11, #16 +; NONEON-NOSVE-NEXT: lsr w10, w10, #16 +; NONEON-NOSVE-NEXT: lsr w9, w9, #16 +; NONEON-NOSVE-NEXT: strh w11, [sp, #30] +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w10, [sp, #28] +; NONEON-NOSVE-NEXT: strh w9, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %1 = zext <4 x i16> %op1 to <4 x i32> %2 = zext <4 x i16> %op2 to <4 x i32> @@ -795,9 +1896,54 @@ define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: umulh_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umull2 v2.4s, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: umull v0.4s, v0.4h, v1.4h -; NONEON-NOSVE-NEXT: uzp2 v0.8h, v0.8h, v2.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-80]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w18, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w0, [sp, #62] +; NONEON-NOSVE-NEXT: mul w15, w15, w16 +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #48] +; NONEON-NOSVE-NEXT: mul w14, w14, w17 +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #56] +; NONEON-NOSVE-NEXT: mul w13, w13, w18 +; NONEON-NOSVE-NEXT: ldrh w18, [sp, #60] +; NONEON-NOSVE-NEXT: mul w12, w12, w16 +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #58] +; NONEON-NOSVE-NEXT: lsr w15, w15, #16 +; NONEON-NOSVE-NEXT: mul w11, w11, w0 +; NONEON-NOSVE-NEXT: lsr w14, w14, #16 +; NONEON-NOSVE-NEXT: mul w10, w10, w18 +; NONEON-NOSVE-NEXT: lsr w13, w13, #16 +; NONEON-NOSVE-NEXT: strh w15, [sp, #78] +; NONEON-NOSVE-NEXT: mul w9, w9, w16 +; NONEON-NOSVE-NEXT: lsr w12, w12, #16 +; NONEON-NOSVE-NEXT: strh w14, [sp, #76] +; NONEON-NOSVE-NEXT: mul w8, w8, w17 +; NONEON-NOSVE-NEXT: lsr w11, w11, #16 +; NONEON-NOSVE-NEXT: strh w13, [sp, #74] +; NONEON-NOSVE-NEXT: lsr w10, w10, #16 +; NONEON-NOSVE-NEXT: strh w12, [sp, #72] +; NONEON-NOSVE-NEXT: lsr w9, w9, #16 +; NONEON-NOSVE-NEXT: strh w11, [sp, #70] +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w10, [sp, #68] +; NONEON-NOSVE-NEXT: strh w9, [sp, #66] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %1 = zext <8 x i16> %op1 to <8 x i32> %2 = 
zext <8 x i16> %op2 to <8 x i32> @@ -830,15 +1976,125 @@ define void @umulh_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: umulh_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: umull2 v4.4s, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: umull v0.4s, v1.4h, v0.4h -; NONEON-NOSVE-NEXT: umull2 v1.4s, v2.8h, v3.8h -; NONEON-NOSVE-NEXT: umull v2.4s, v2.4h, v3.4h -; NONEON-NOSVE-NEXT: uzp2 v0.8h, v0.8h, v4.8h -; NONEON-NOSVE-NEXT: uzp2 v1.8h, v2.8h, v1.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #240 +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #160] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #176] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #192] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #208] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #224] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 240 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: str q3, [sp, #16] +; NONEON-NOSVE-NEXT: str q2, [sp, #64] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w13, 
[sp, #50] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w7, [sp, #96] +; NONEON-NOSVE-NEXT: ldrh w19, [sp, #98] +; NONEON-NOSVE-NEXT: ldrh w20, [sp, #100] +; NONEON-NOSVE-NEXT: ldrh w21, [sp, #102] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #54] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w18, [sp, #104] +; NONEON-NOSVE-NEXT: ldrh w2, [sp, #106] +; NONEON-NOSVE-NEXT: ldrh w4, [sp, #108] +; NONEON-NOSVE-NEXT: ldrh w5, [sp, #110] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w1, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w3, [sp, #94] +; NONEON-NOSVE-NEXT: mul w8, w8, w15 +; NONEON-NOSVE-NEXT: ldrh w6, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w23, [sp, #82] +; NONEON-NOSVE-NEXT: mul w11, w11, w3 +; NONEON-NOSVE-NEXT: ldrh w25, [sp, #84] +; NONEON-NOSVE-NEXT: mul w13, w13, w23 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #64] +; NONEON-NOSVE-NEXT: mul w14, w14, w25 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: mul w12, w12, w6 +; NONEON-NOSVE-NEXT: lsr w11, w11, #16 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112] +; NONEON-NOSVE-NEXT: mul w10, w10, w1 +; NONEON-NOSVE-NEXT: lsr w13, w13, #16 +; NONEON-NOSVE-NEXT: ldrh w22, [sp, #118] +; NONEON-NOSVE-NEXT: ldrh w24, [sp, #116] +; NONEON-NOSVE-NEXT: ldrh w26, [sp, #114] +; NONEON-NOSVE-NEXT: ldrh w27, [sp, #112] +; NONEON-NOSVE-NEXT: ldrh w28, [sp, #126] +; NONEON-NOSVE-NEXT: mul w9, w9, w17 +; NONEON-NOSVE-NEXT: mul w21, w21, w22 +; NONEON-NOSVE-NEXT: ldrh w22, [sp, #86] +; NONEON-NOSVE-NEXT: lsr w14, w14, #16 +; NONEON-NOSVE-NEXT: mul w20, w20, w24 +; NONEON-NOSVE-NEXT: ldrh w24, [sp, #120] +; NONEON-NOSVE-NEXT: lsr w12, w12, #16 +; NONEON-NOSVE-NEXT: mul w19, w19, w26 +; NONEON-NOSVE-NEXT: ldrh w26, [sp, #124] +; NONEON-NOSVE-NEXT: lsr w10, w10, #16 +; NONEON-NOSVE-NEXT: mul w7, w7, w27 +; NONEON-NOSVE-NEXT: ldrh w27, [sp, #122] 
+; NONEON-NOSVE-NEXT: lsr w21, w21, #16 +; NONEON-NOSVE-NEXT: mul w5, w5, w28 +; NONEON-NOSVE-NEXT: lsr w20, w20, #16 +; NONEON-NOSVE-NEXT: lsr w9, w9, #16 +; NONEON-NOSVE-NEXT: mul w4, w4, w26 +; NONEON-NOSVE-NEXT: lsr w19, w19, #16 +; NONEON-NOSVE-NEXT: strh w21, [sp, #158] +; NONEON-NOSVE-NEXT: mul w2, w2, w27 +; NONEON-NOSVE-NEXT: lsr w7, w7, #16 +; NONEON-NOSVE-NEXT: strh w20, [sp, #156] +; NONEON-NOSVE-NEXT: mul w18, w18, w24 +; NONEON-NOSVE-NEXT: lsr w5, w5, #16 +; NONEON-NOSVE-NEXT: strh w19, [sp, #154] +; NONEON-NOSVE-NEXT: mul w16, w16, w22 +; NONEON-NOSVE-NEXT: lsr w4, w4, #16 +; NONEON-NOSVE-NEXT: strh w7, [sp, #152] +; NONEON-NOSVE-NEXT: lsr w2, w2, #16 +; NONEON-NOSVE-NEXT: strh w5, [sp, #150] +; NONEON-NOSVE-NEXT: lsr w18, w18, #16 +; NONEON-NOSVE-NEXT: strh w4, [sp, #148] +; NONEON-NOSVE-NEXT: lsr w16, w16, #16 +; NONEON-NOSVE-NEXT: strh w2, [sp, #146] +; NONEON-NOSVE-NEXT: strh w18, [sp, #144] +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #224] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w16, [sp, #142] +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #208] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w14, [sp, #140] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #192] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w13, [sp, #138] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #176] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w12, [sp, #136] +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #160] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w11, [sp, #134] +; NONEON-NOSVE-NEXT: strh w10, [sp, #132] +; NONEON-NOSVE-NEXT: strh w9, [sp, #130] +; NONEON-NOSVE-NEXT: strh w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #128] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #240 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -871,8 +2127,18 @@ define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: umulh_v2i32: ; NONEON-NOSVE: // %bb.0: -; 
NONEON-NOSVE-NEXT: umull v0.2d, v0.2s, v1.2s -; NONEON-NOSVE-NEXT: shrn v0.2s, v0.2d, #32 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w11, w10, [sp, #16] +; NONEON-NOSVE-NEXT: umull x9, w9, w10 +; NONEON-NOSVE-NEXT: umull x8, w8, w11 +; NONEON-NOSVE-NEXT: lsr x9, x9, #32 +; NONEON-NOSVE-NEXT: lsr x8, x8, #32 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %1 = zext <2 x i32> %op1 to <2 x i64> %2 = zext <2 x i32> %op2 to <2 x i64> @@ -902,9 +2168,28 @@ define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: umulh_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umull2 v2.2d, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: umull v0.2d, v0.2s, v1.2s -; NONEON-NOSVE-NEXT: uzp2 v0.4s, v0.4s, v2.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-80]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w10, w11, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldp w13, w12, [sp, #48] +; NONEON-NOSVE-NEXT: umull x11, w11, w12 +; NONEON-NOSVE-NEXT: ldp w12, w14, [sp, #56] +; NONEON-NOSVE-NEXT: umull x10, w10, w13 +; NONEON-NOSVE-NEXT: lsr x11, x11, #32 +; NONEON-NOSVE-NEXT: umull x9, w9, w14 +; NONEON-NOSVE-NEXT: umull x8, w8, w12 +; NONEON-NOSVE-NEXT: lsr x10, x10, #32 +; NONEON-NOSVE-NEXT: lsr x9, x9, #32 +; NONEON-NOSVE-NEXT: stp w10, w11, [sp, #72] +; NONEON-NOSVE-NEXT: lsr x8, x8, #32 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %1 = zext <4 x i32> %op1 to <4 x i64> %2 = zext <4 x i32> %op2 to <4 x i64> @@ -937,15 +2222,52 @@ define void @umulh_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: umulh_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: umull2 v4.2d, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: umull v0.2d, v1.2s, v0.2s -; NONEON-NOSVE-NEXT: umull2 v1.2d, v2.4s, v3.4s -; NONEON-NOSVE-NEXT: umull v2.2d, v2.2s, v3.2s -; NONEON-NOSVE-NEXT: uzp2 v0.4s, v0.4s, v4.4s -; NONEON-NOSVE-NEXT: uzp2 v1.4s, v2.4s, v1.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #160 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: str q2, [sp, #64] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldp w10, w11, [sp, #48] +; 
NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w14, w15, [sp, #96] +; NONEON-NOSVE-NEXT: ldp w12, w13, [sp, #104] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #64] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112] +; NONEON-NOSVE-NEXT: ldp w17, w16, [sp, #112] +; NONEON-NOSVE-NEXT: umull x15, w15, w16 +; NONEON-NOSVE-NEXT: ldp w16, w18, [sp, #120] +; NONEON-NOSVE-NEXT: umull x14, w14, w17 +; NONEON-NOSVE-NEXT: ldp w17, w1, [sp, #80] +; NONEON-NOSVE-NEXT: umull x13, w13, w18 +; NONEON-NOSVE-NEXT: lsr x15, x15, #32 +; NONEON-NOSVE-NEXT: umull x12, w12, w16 +; NONEON-NOSVE-NEXT: lsr x14, x14, #32 +; NONEON-NOSVE-NEXT: ldp w16, w18, [sp, #88] +; NONEON-NOSVE-NEXT: umull x11, w11, w1 +; NONEON-NOSVE-NEXT: lsr x13, x13, #32 +; NONEON-NOSVE-NEXT: stp w14, w15, [sp, #152] +; NONEON-NOSVE-NEXT: umull x10, w10, w17 +; NONEON-NOSVE-NEXT: lsr x12, x12, #32 +; NONEON-NOSVE-NEXT: umull x9, w9, w18 +; NONEON-NOSVE-NEXT: umull x8, w8, w16 +; NONEON-NOSVE-NEXT: lsr x11, x11, #32 +; NONEON-NOSVE-NEXT: stp w12, w13, [sp, #144] +; NONEON-NOSVE-NEXT: lsr x10, x10, #32 +; NONEON-NOSVE-NEXT: lsr x9, x9, #32 +; NONEON-NOSVE-NEXT: lsr x8, x8, #32 +; NONEON-NOSVE-NEXT: stp w10, w11, [sp, #136] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #128] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -980,12 +2302,14 @@ define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: umulh_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fmov x8, d0 ; NONEON-NOSVE-NEXT: fmov x9, d1 ; NONEON-NOSVE-NEXT: umulh x8, x8, x9 -; 
NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %1 = zext <1 x i64> %op1 to <1 x i128> %2 = zext <1 x i64> %op2 to <1 x i128> @@ -1015,15 +2339,17 @@ define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: umulh_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov x8, v0.d[1] -; NONEON-NOSVE-NEXT: mov x9, v1.d[1] -; NONEON-NOSVE-NEXT: fmov x10, d0 -; NONEON-NOSVE-NEXT: fmov x11, d1 -; NONEON-NOSVE-NEXT: umulh x10, x10, x11 -; NONEON-NOSVE-NEXT: umulh x8, x8, x9 -; NONEON-NOSVE-NEXT: fmov d0, x10 -; NONEON-NOSVE-NEXT: fmov d1, x8 -; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp] +; NONEON-NOSVE-NEXT: ldp x11, x10, [sp, #16] +; NONEON-NOSVE-NEXT: umulh x8, x8, x10 +; NONEON-NOSVE-NEXT: umulh x9, x9, x11 +; NONEON-NOSVE-NEXT: stp x9, x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %1 = zext <2 x i64> %op1 to <2 x i128> %2 = zext <2 x i64> %op2 to <2 x i128> @@ -1056,27 +2382,29 @@ define void @umulh_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: umulh_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fmov x9, d0 -; NONEON-NOSVE-NEXT: mov x11, v0.d[1] -; NONEON-NOSVE-NEXT: mov x14, v3.d[1] -; NONEON-NOSVE-NEXT: fmov x8, d1 -; NONEON-NOSVE-NEXT: mov x10, v1.d[1] -; NONEON-NOSVE-NEXT: mov x13, v2.d[1] -; NONEON-NOSVE-NEXT: fmov x12, d3 -; NONEON-NOSVE-NEXT: umulh x8, x8, x9 -; NONEON-NOSVE-NEXT: fmov x9, d2 -; NONEON-NOSVE-NEXT: umulh x10, x10, x11 -; NONEON-NOSVE-NEXT: umulh x9, x9, x12 -; NONEON-NOSVE-NEXT: fmov d0, x8 -; NONEON-NOSVE-NEXT: umulh x11, 
x13, x14 -; NONEON-NOSVE-NEXT: fmov d1, x10 -; NONEON-NOSVE-NEXT: fmov d2, x9 -; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] -; NONEON-NOSVE-NEXT: fmov d3, x11 -; NONEON-NOSVE-NEXT: mov v2.d[1], v3.d[0] -; NONEON-NOSVE-NEXT: stp q0, q2, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #128 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [sp] +; NONEON-NOSVE-NEXT: ldp x11, x10, [sp] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x13, x12, [sp, #16] +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp, #32] +; NONEON-NOSVE-NEXT: umulh x10, x10, x12 +; NONEON-NOSVE-NEXT: ldp x14, x12, [sp, #48] +; NONEON-NOSVE-NEXT: umulh x11, x11, x13 +; NONEON-NOSVE-NEXT: umulh x8, x8, x12 +; NONEON-NOSVE-NEXT: umulh x9, x9, x14 +; NONEON-NOSVE-NEXT: stp x11, x10, [sp, #64] +; NONEON-NOSVE-NEXT: stp x9, x8, [sp, #80] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll index ad75ba62e17cf8..7bdb4599707b0c 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll @@ -21,8 +21,25 @@ define i8 @uaddv_v8i8(<8 x i8> %a) { ; ; NONEON-NOSVE-LABEL: uaddv_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: addv b0, v0.8b -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, 
[sp, #13] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #14] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: add w12, w13, w12 +; NONEON-NOSVE-NEXT: add w10, w11, w10 +; NONEON-NOSVE-NEXT: add w10, w12, w10 +; NONEON-NOSVE-NEXT: add w8, w8, w14 +; NONEON-NOSVE-NEXT: add w8, w10, w8 +; NONEON-NOSVE-NEXT: add w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a) ret i8 %res @@ -40,8 +57,40 @@ define i8 @uaddv_v16i8(<16 x i8> %a) { ; ; NONEON-NOSVE-LABEL: uaddv_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: addv b0, v0.16b -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w11, [sp] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #4] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #3] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #2] +; NONEON-NOSVE-NEXT: add w10, w11, w10 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #8] +; NONEON-NOSVE-NEXT: add w11, w14, w13 +; NONEON-NOSVE-NEXT: add w9, w12, w9 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #6] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #7] +; NONEON-NOSVE-NEXT: add w10, w10, w11 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #13] +; NONEON-NOSVE-NEXT: add w9, w9, w16 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #9] +; NONEON-NOSVE-NEXT: add w12, w12, w15 +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #14] +; NONEON-NOSVE-NEXT: add w8, w13, w8 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #15] +; NONEON-NOSVE-NEXT: add w12, w12, w14 +; NONEON-NOSVE-NEXT: add w8, w8, w11 +; 
NONEON-NOSVE-NEXT: add w9, w10, w9 +; NONEON-NOSVE-NEXT: add w10, w12, w16 +; NONEON-NOSVE-NEXT: add w8, w8, w15 +; NONEON-NOSVE-NEXT: add w9, w9, w10 +; NONEON-NOSVE-NEXT: add w8, w8, w13 +; NONEON-NOSVE-NEXT: add w0, w9, w8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a) ret i8 %res @@ -61,9 +110,72 @@ define i8 @uaddv_v32i8(ptr %a) { ; NONEON-NOSVE-LABEL: uaddv_v32i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: add v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: addv b0, v0.16b -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w11, [sp] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #2] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #3] +; NONEON-NOSVE-NEXT: add w9, w11, w10 +; NONEON-NOSVE-NEXT: add w10, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #5] +; NONEON-NOSVE-NEXT: add w11, w15, w14 +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #4] +; NONEON-NOSVE-NEXT: add w9, w10, w11 +; NONEON-NOSVE-NEXT: add w10, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #6] +; NONEON-NOSVE-NEXT: add w14, w17, w16 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #8] +; NONEON-NOSVE-NEXT: add w10, w14, w10 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #7] +; NONEON-NOSVE-NEXT: add w11, w12, w11 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: add w9, w10, w11 +; NONEON-NOSVE-NEXT: add w10, w14, w13 +; NONEON-NOSVE-NEXT: ldrb 
w13, [sp, #9] +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: add w11, w15, w12 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #26] +; NONEON-NOSVE-NEXT: add w9, w10, w11 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #10] +; NONEON-NOSVE-NEXT: add w10, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #12] +; NONEON-NOSVE-NEXT: add w9, w9, w10 +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #14] +; NONEON-NOSVE-NEXT: add w11, w12, w11 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #15] +; NONEON-NOSVE-NEXT: add w10, w13, w10 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #13] +; NONEON-NOSVE-NEXT: add w14, w15, w14 +; NONEON-NOSVE-NEXT: add w10, w11, w10 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #30] +; NONEON-NOSVE-NEXT: add w9, w9, w14 +; NONEON-NOSVE-NEXT: add w12, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #31] +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: add w10, w10, w12 +; NONEON-NOSVE-NEXT: add w11, w16, w11 +; NONEON-NOSVE-NEXT: add w10, w10, w11 +; NONEON-NOSVE-NEXT: add w11, w17, w13 +; NONEON-NOSVE-NEXT: add w9, w10, w11 +; NONEON-NOSVE-NEXT: add w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %op) @@ -82,8 +194,17 @@ define i16 @uaddv_v4i16(<4 x i16> %a) { ; ; NONEON-NOSVE-LABEL: uaddv_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: addv h0, v0.4h -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #8] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: add w10, w11, w10 +; NONEON-NOSVE-NEXT: 
add w0, w10, w8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a) ret i16 %res @@ -101,8 +222,24 @@ define i16 @uaddv_v8i16(<8 x i16> %a) { ; ; NONEON-NOSVE-LABEL: uaddv_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: addv h0, v0.8h -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w13, [sp] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: add w12, w13, w12 +; NONEON-NOSVE-NEXT: add w10, w11, w10 +; NONEON-NOSVE-NEXT: add w10, w12, w10 +; NONEON-NOSVE-NEXT: add w8, w8, w14 +; NONEON-NOSVE-NEXT: add w8, w10, w8 +; NONEON-NOSVE-NEXT: add w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a) ret i16 %res @@ -122,9 +259,40 @@ define i16 @uaddv_v16i16(ptr %a) { ; NONEON-NOSVE-LABEL: uaddv_v16i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: add v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: addv h0, v0.8h -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w11, [sp] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #4] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #6] +; NONEON-NOSVE-NEXT: add w9, w11, w10 +; NONEON-NOSVE-NEXT: add w12, w13, w12 +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #10] +; NONEON-NOSVE-NEXT: add w13, w15, w14 +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #8] +; NONEON-NOSVE-NEXT: add w9, w12, w13 +; NONEON-NOSVE-NEXT: add w10, w11, w10 +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #12] +; NONEON-NOSVE-NEXT: add w14, w17, w16 +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #14] +; NONEON-NOSVE-NEXT: add w10, w14, w10 +; NONEON-NOSVE-NEXT: add w11, w15, w11 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: add w9, w10, w11 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: add w9, w13, w12 +; NONEON-NOSVE-NEXT: add w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %op) @@ -143,8 +311,12 @@ define i32 @uaddv_v2i32(<2 x i32> %a) { ; ; NONEON-NOSVE-LABEL: uaddv_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: addp v0.2s, v0.2s, v0.2s -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: add w0, w9, w8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a) ret i32 %res @@ -162,8 
+334,13 @@ define i32 @uaddv_v4i32(<4 x i32> %a) { ; ; NONEON-NOSVE-LABEL: uaddv_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: addv s0, v0.4s -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w11, w10, [sp], #16 +; NONEON-NOSVE-NEXT: add w10, w11, w10 +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: add w0, w10, w8 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a) ret i32 %res @@ -183,9 +360,20 @@ define i32 @uaddv_v8i32(ptr %a) { ; NONEON-NOSVE-LABEL: uaddv_v8i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: add v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: addv s0, v0.4s -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w11, w10, [sp] +; NONEON-NOSVE-NEXT: ldp w12, w13, [sp, #24] +; NONEON-NOSVE-NEXT: ldp w14, w15, [sp, #8] +; NONEON-NOSVE-NEXT: add w8, w10, w8 +; NONEON-NOSVE-NEXT: add w9, w11, w9 +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: add w10, w14, w12 +; NONEON-NOSVE-NEXT: add w11, w15, w13 +; NONEON-NOSVE-NEXT: add w9, w10, w11 +; NONEON-NOSVE-NEXT: add w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %op) @@ -203,8 +391,10 @@ define i64 @uaddv_v2i64(<2 x i64> %a) { ; ; NONEON-NOSVE-LABEL: uaddv_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: addp d0, v0.2d -; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp], #16 +; NONEON-NOSVE-NEXT: add x0, x9, x8 ; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a) ret i64 %res @@ -223,9 +413,13 @@ define i64 @uaddv_v4i64(ptr %a) { ; NONEON-NOSVE-LABEL: uaddv_v4i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: add v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: addp d0, v0.2d -; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp, #16] +; NONEON-NOSVE-NEXT: ldp x11, x10, [sp], #32 +; NONEON-NOSVE-NEXT: add x8, x10, x8 +; NONEON-NOSVE-NEXT: add x9, x11, x9 +; NONEON-NOSVE-NEXT: add x0, x9, x8 ; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %op) @@ -247,8 +441,32 @@ define i8 @smaxv_v8i8(<8 x i8> %a) { ; ; NONEON-NOSVE-LABEL: smaxv_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smaxv b0, v0.8b -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, 
w8, w10, gt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, gt +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %a) ret i8 %res @@ -265,8 +483,55 @@ define i8 @smaxv_v16i8(<16 x i8> %a) { ; ; NONEON-NOSVE-LABEL: smaxv_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smaxv b0, v0.16b -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #1] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp] +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #3] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #5] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #7] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; 
NONEON-NOSVE-NEXT: ldrsb w10, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, gt +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %a) ret i8 %res @@ -285,9 +550,103 @@ define i8 @smaxv_v32i8(ptr %a) { ; NONEON-NOSVE-LABEL: smaxv_v32i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: smax v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: smaxv b0, v0.16b -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #1] +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #19] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #3] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #21] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #5] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, 
w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #23] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #7] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #25] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #9] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #27] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #29] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, 
gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #31] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, gt +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %op) @@ -305,8 +664,20 @@ define i16 @smaxv_v4i16(<4 x i16> %a) { ; ; NONEON-NOSVE-LABEL: smaxv_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smaxv h0, v0.4h -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, gt +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %a) ret i16 %res @@ -323,8 +694,31 @@ define i16 @smaxv_v8i16(<8 x i16> %a) { ; ; NONEON-NOSVE-LABEL: smaxv_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smaxv h0, v0.8h -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #2] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, gt +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %a) ret i16 %res @@ -343,9 +737,55 @@ define i16 @smaxv_v16i16(ptr %a) { ; NONEON-NOSVE-LABEL: smaxv_v16i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: smax v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: smaxv h0, v0.8h -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, gt +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call i16 
@llvm.vector.reduce.smax.v16i16(<16 x i16> %op) @@ -363,8 +803,13 @@ define i32 @smaxv_v2i32(<2 x i32> %a) { ; ; NONEON-NOSVE-LABEL: smaxv_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smaxp v0.2s, v0.2s, v0.2s -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w0, w9, w8, gt +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %a) ret i32 %res @@ -381,8 +826,17 @@ define i32 @smaxv_v4i32(<4 x i32> %a) { ; ; NONEON-NOSVE-LABEL: smaxv_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smaxv s0, v0.4s -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, gt +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a) ret i32 %res @@ -401,9 +855,27 @@ define i32 @smaxv_v8i32(ptr %a) { ; NONEON-NOSVE-LABEL: smaxv_v8i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: smax v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: smaxv s0, v0.4s -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp w11, w8, [sp] +; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldp w10, w12, [sp, #8] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldp w11, w9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w10, w11 +; NONEON-NOSVE-NEXT: csel w10, w10, w11, gt +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; NONEON-NOSVE-NEXT: cmp w12, w9 +; NONEON-NOSVE-NEXT: csel w9, w12, w9, gt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, gt +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %op) @@ -424,11 +896,9 @@ define i64 @smaxv_v2i64(<2 x i64> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: cmgt d2, d0, d1 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: fmov x0, d0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp], #16 +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x0, x9, x8, gt ; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %a) ret i64 %res @@ -447,15 +917,17 @@ define i64 @smaxv_v4i64(ptr %a) { ; NONEON-NOSVE-LABEL: smaxv_v4i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: cmgt v2.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: bit v0.16b, v1.16b, v2.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: cmgt d2, d0, d1 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: fmov x0, d0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp x8, x10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr x11, [sp], #32 +; NONEON-NOSVE-NEXT: cmp x8, x9 +; NONEON-NOSVE-NEXT: csel x8, x8, x9, gt +; NONEON-NOSVE-NEXT: cmp x11, x10 +; NONEON-NOSVE-NEXT: csel x9, x11, x10, gt +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x0, x9, x8, gt ; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %op) @@ -477,8 +949,32 @@ define i8 @sminv_v8i8(<8 x i8> %a) { ; ; NONEON-NOSVE-LABEL: sminv_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sminv b0, v0.8b -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lt +; 
NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %a) ret i8 %res @@ -495,8 +991,55 @@ define i8 @sminv_v16i8(<16 x i8> %a) { ; ; NONEON-NOSVE-LABEL: sminv_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sminv b0, v0.16b -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #1] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp] +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #3] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #5] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #7] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; 
NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lt +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %a) ret i8 %res @@ -515,9 +1058,103 @@ define i8 @sminv_v32i8(ptr %a) { ; NONEON-NOSVE-LABEL: sminv_v32i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: smin v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: sminv b0, v0.16b -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #1] +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #19] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #3] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #21] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #5] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, 
w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #23] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #7] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #25] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #9] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #27] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #29] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsb 
w11, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #31] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lt +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %op) @@ -535,8 +1172,20 @@ define i16 @sminv_v4i16(<4 x i16> %a) { ; ; NONEON-NOSVE-LABEL: sminv_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sminv h0, v0.4h -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lt +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %a) ret i16 %res @@ -553,8 +1202,31 @@ define i16 @sminv_v8i16(<8 x i16> %a) { ; ; NONEON-NOSVE-LABEL: sminv_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sminv h0, v0.8h -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #2] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lt +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %a) ret i16 %res @@ -573,9 +1245,55 @@ define i16 @sminv_v16i16(ptr %a) { ; NONEON-NOSVE-LABEL: sminv_v16i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: smin v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: sminv h0, v0.8h -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lt +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call i16 
@llvm.vector.reduce.smin.v16i16(<16 x i16> %op) @@ -593,8 +1311,13 @@ define i32 @sminv_v2i32(<2 x i32> %a) { ; ; NONEON-NOSVE-LABEL: sminv_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sminp v0.2s, v0.2s, v0.2s -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w0, w9, w8, lt +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %a) ret i32 %res @@ -611,8 +1334,17 @@ define i32 @sminv_v4i32(<4 x i32> %a) { ; ; NONEON-NOSVE-LABEL: sminv_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sminv s0, v0.4s -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lt +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a) ret i32 %res @@ -631,9 +1363,27 @@ define i32 @sminv_v8i32(ptr %a) { ; NONEON-NOSVE-LABEL: sminv_v8i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: smin v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: sminv s0, v0.4s -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp w11, w8, [sp] +; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldp w10, w12, [sp, #8] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldp w11, w9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w10, w11 +; NONEON-NOSVE-NEXT: csel w10, w10, w11, lt +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: cmp w12, w9 +; NONEON-NOSVE-NEXT: csel w9, w12, w9, lt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lt +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %op) @@ -654,11 +1404,9 @@ define i64 @sminv_v2i64(<2 x i64> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: cmgt d2, d1, d0 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: fmov x0, d0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp], #16 +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x0, x9, x8, lt ; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %a) ret i64 %res @@ -676,16 +1424,18 @@ define i64 @sminv_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: sminv_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: cmgt v2.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: cmgt d2, d1, d0 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: fmov x0, d0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp x8, x10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr x11, [sp], #32 +; NONEON-NOSVE-NEXT: cmp x8, x9 +; NONEON-NOSVE-NEXT: csel x8, x8, x9, lt +; NONEON-NOSVE-NEXT: cmp x11, x10 +; NONEON-NOSVE-NEXT: csel x9, x11, x10, lt +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x0, x9, x8, lt ; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %op) @@ -707,8 +1457,32 @@ define i8 @umaxv_v8i8(<8 x i8> %a) { ; ; NONEON-NOSVE-LABEL: umaxv_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umaxv b0, v0.8b -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel 
w0, w8, w9, hi +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %a) ret i8 %res @@ -725,8 +1499,55 @@ define i8 @umaxv_v16i8(<16 x i8> %a) { ; ; NONEON-NOSVE-LABEL: umaxv_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umaxv b0, v0.16b -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; 
NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, hi +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %a) ret i8 %res @@ -745,9 +1566,103 @@ define i8 @umaxv_v32i8(ptr %a) { ; NONEON-NOSVE-LABEL: umaxv_v32i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: umax v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: umaxv b0, v0.16b -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w11, [sp] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #3] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #5] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi 
+; NONEON-NOSVE-NEXT: ldrb w10, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #7] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #9] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #14] +; 
NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, hi +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %op) @@ -765,8 +1680,20 @@ define i16 @umaxv_v4i16(<4 x i16> %a) { ; ; NONEON-NOSVE-LABEL: umaxv_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umaxv h0, v0.4h -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, hi +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %a) ret i16 %res @@ -783,8 +1710,31 @@ define i16 @umaxv_v8i16(<8 x i16> %a) { ; ; NONEON-NOSVE-LABEL: umaxv_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umaxv h0, v0.8h -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, hi +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %a) ret i16 %res @@ -803,9 +1753,55 @@ define i16 @umaxv_v16i16(ptr %a) { ; NONEON-NOSVE-LABEL: umaxv_v16i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: umax v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: umaxv h0, v0.8h -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w11, [sp] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, hi +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call i16 
@llvm.vector.reduce.umax.v16i16(<16 x i16> %op) @@ -823,8 +1819,13 @@ define i32 @umaxv_v2i32(<2 x i32> %a) { ; ; NONEON-NOSVE-LABEL: umaxv_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umaxp v0.2s, v0.2s, v0.2s -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w0, w9, w8, hi +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %a) ret i32 %res @@ -841,8 +1842,17 @@ define i32 @umaxv_v4i32(<4 x i32> %a) { ; ; NONEON-NOSVE-LABEL: umaxv_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umaxv s0, v0.4s -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, hi +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a) ret i32 %res @@ -861,9 +1871,27 @@ define i32 @umaxv_v8i32(ptr %a) { ; NONEON-NOSVE-LABEL: umaxv_v8i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: umax v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: umaxv s0, v0.4s -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp w11, w8, [sp] +; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldp w10, w12, [sp, #8] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldp w11, w9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w10, w11 +; NONEON-NOSVE-NEXT: csel w10, w10, w11, hi +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: cmp w12, w9 +; NONEON-NOSVE-NEXT: csel w9, w12, w9, hi +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, hi +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %op) @@ -884,11 +1912,9 @@ define i64 @umaxv_v2i64(<2 x i64> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: cmhi d2, d0, d1 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: fmov x0, d0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp], #16 +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x0, x9, x8, hi ; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a) ret i64 %res @@ -907,15 +1933,17 @@ define i64 @umaxv_v4i64(ptr %a) { ; NONEON-NOSVE-LABEL: umaxv_v4i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: cmhi v2.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: bit v0.16b, v1.16b, v2.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: cmhi d2, d0, d1 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: fmov x0, d0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp x8, x10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr x11, [sp], #32 +; NONEON-NOSVE-NEXT: cmp x8, x9 +; NONEON-NOSVE-NEXT: csel x8, x8, x9, hi +; NONEON-NOSVE-NEXT: cmp x11, x10 +; NONEON-NOSVE-NEXT: csel x9, x11, x10, hi +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x0, x9, x8, hi ; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %op) @@ -937,8 +1965,32 @@ define i8 @uminv_v8i8(<8 x i8> %a) { ; ; NONEON-NOSVE-LABEL: uminv_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: uminv b0, v0.8b -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lo +; NONEON-NOSVE-NEXT: 
add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %a) ret i8 %res @@ -955,8 +2007,55 @@ define i8 @uminv_v16i8(<16 x i8> %a) { ; ; NONEON-NOSVE-LABEL: uminv_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: uminv b0, v0.16b -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; 
NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lo +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %a) ret i8 %res @@ -975,9 +2074,103 @@ define i8 @uminv_v32i8(ptr %a) { ; NONEON-NOSVE-LABEL: uminv_v32i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: umin v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: uminv b0, v0.16b -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w11, [sp] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #3] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #5] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #22] 
+; NONEON-NOSVE-NEXT: ldrb w11, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #7] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #9] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: 
csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lo +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %op) @@ -995,8 +2188,20 @@ define i16 @uminv_v4i16(<4 x i16> %a) { ; ; NONEON-NOSVE-LABEL: uminv_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: uminv h0, v0.4h -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lo +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %a) ret i16 %res @@ -1013,8 +2218,31 @@ define i16 @uminv_v8i16(<8 x i16> %a) { ; ; NONEON-NOSVE-LABEL: uminv_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: uminv h0, v0.8h -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lo +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %a) ret i16 %res @@ -1033,9 +2261,55 @@ define i16 @uminv_v16i16(ptr %a) { ; NONEON-NOSVE-LABEL: uminv_v16i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: umin v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: uminv h0, v0.8h -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w11, [sp] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lo +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call i16 
@llvm.vector.reduce.umin.v16i16(<16 x i16> %op) @@ -1053,8 +2327,13 @@ define i32 @uminv_v2i32(<2 x i32> %a) { ; ; NONEON-NOSVE-LABEL: uminv_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: uminp v0.2s, v0.2s, v0.2s -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w0, w9, w8, lo +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %a) ret i32 %res @@ -1071,8 +2350,17 @@ define i32 @uminv_v4i32(<4 x i32> %a) { ; ; NONEON-NOSVE-LABEL: uminv_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: uminv s0, v0.4s -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lo +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a) ret i32 %res @@ -1091,9 +2379,27 @@ define i32 @uminv_v8i32(ptr %a) { ; NONEON-NOSVE-LABEL: uminv_v8i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: umin v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: uminv s0, v0.4s -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp w11, w8, [sp] +; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldp w10, w12, [sp, #8] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldp w11, w9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w10, w11 +; NONEON-NOSVE-NEXT: csel w10, w10, w11, lo +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: cmp w12, w9 +; NONEON-NOSVE-NEXT: csel w9, w12, w9, lo +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lo +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %op) @@ -1114,11 +2420,9 @@ define i64 @uminv_v2i64(<2 x i64> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: cmhi d2, d1, d0 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: fmov x0, d0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp], #16 +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x0, x9, x8, lo ; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %a) ret i64 %res @@ -1136,16 +2440,18 @@ define i64 @uminv_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: uminv_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: cmhi v2.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: cmhi d2, d1, d0 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: fmov x0, d0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp x8, x10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr x11, [sp], #32 +; NONEON-NOSVE-NEXT: cmp x8, x9 +; NONEON-NOSVE-NEXT: csel x8, x8, x9, lo +; NONEON-NOSVE-NEXT: cmp x11, x10 +; NONEON-NOSVE-NEXT: csel x9, x11, x10, lo +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x0, x9, x8, lo ; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %op) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll index 99f8aef9f2b22d..cb1fb20ec9d8d7 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll @@ -28,31 +28,31 @@ define <4 x i8> @srem_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: srem_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #8 -; NONEON-NOSVE-NEXT: shl v1.4h, v1.4h, #8 -; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #8 -; NONEON-NOSVE-NEXT: sshr v1.4h, v1.4h, #8 -; NONEON-NOSVE-NEXT: smov w11, v1.h[0] -; NONEON-NOSVE-NEXT: smov w12, v0.h[0] -; NONEON-NOSVE-NEXT: smov w8, v1.h[1] -; NONEON-NOSVE-NEXT: smov w9, v0.h[1] -; NONEON-NOSVE-NEXT: smov w14, v1.h[2] -; NONEON-NOSVE-NEXT: smov w15, v0.h[2] -; NONEON-NOSVE-NEXT: smov w17, v1.h[3] -; NONEON-NOSVE-NEXT: smov w18, v0.h[3] -; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; 
NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsb w12, [sp, #12] +; NONEON-NOSVE-NEXT: ldrsb w14, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsb w15, [sp, #10] ; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 -; NONEON-NOSVE-NEXT: fmov s0, w11 -; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 +; NONEON-NOSVE-NEXT: ldrsb w17, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsb w18, [sp, #8] +; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 ; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: mov v0.h[1], w8 -; NONEON-NOSVE-NEXT: sdiv w9, w18, w17 -; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 -; NONEON-NOSVE-NEXT: mov v0.h[2], w8 -; NONEON-NOSVE-NEXT: msub w8, w9, w17, w18 -; NONEON-NOSVE-NEXT: mov v0.h[3], w8 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w9, w13, w11, w12 +; NONEON-NOSVE-NEXT: strh w9, [sp, #28] +; NONEON-NOSVE-NEXT: sdiv w0, w18, w17 +; NONEON-NOSVE-NEXT: msub w10, w16, w14, w15 +; NONEON-NOSVE-NEXT: strh w10, [sp, #26] +; NONEON-NOSVE-NEXT: msub w8, w0, w17, w18 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = srem <4 x i8> %op1, %op2 ret <4 x i8> %res @@ -86,49 +86,51 @@ define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: srem_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: smov w11, v1.b[0] -; NONEON-NOSVE-NEXT: smov w12, v0.b[0] -; NONEON-NOSVE-NEXT: smov w8, v1.b[1] -; NONEON-NOSVE-NEXT: smov w9, v0.b[1] -; NONEON-NOSVE-NEXT: smov w14, v1.b[2] -; NONEON-NOSVE-NEXT: smov w15, v0.b[2] -; NONEON-NOSVE-NEXT: smov w17, v1.b[3] -; NONEON-NOSVE-NEXT: smov w18, v0.b[3] -; 
NONEON-NOSVE-NEXT: smov w1, v1.b[4] -; NONEON-NOSVE-NEXT: smov w2, v0.b[4] -; NONEON-NOSVE-NEXT: smov w4, v1.b[5] -; NONEON-NOSVE-NEXT: smov w5, v0.b[5] -; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] ; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 -; NONEON-NOSVE-NEXT: smov w13, v1.b[7] -; NONEON-NOSVE-NEXT: fmov s2, w11 -; NONEON-NOSVE-NEXT: smov w11, v0.b[6] -; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 ; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: smov w10, v1.b[6] -; NONEON-NOSVE-NEXT: mov v2.b[1], w8 -; NONEON-NOSVE-NEXT: sdiv w0, w18, w17 -; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 -; NONEON-NOSVE-NEXT: smov w14, v0.b[7] -; NONEON-NOSVE-NEXT: mov v2.b[2], w8 -; NONEON-NOSVE-NEXT: sdiv w3, w2, w1 -; NONEON-NOSVE-NEXT: msub w8, w0, w17, w18 -; NONEON-NOSVE-NEXT: mov v2.b[3], w8 -; NONEON-NOSVE-NEXT: sdiv w9, w5, w4 -; NONEON-NOSVE-NEXT: msub w8, w3, w1, w2 -; NONEON-NOSVE-NEXT: mov v2.b[4], w8 -; NONEON-NOSVE-NEXT: sdiv w12, w11, w10 -; NONEON-NOSVE-NEXT: msub w8, w9, w4, w5 -; NONEON-NOSVE-NEXT: mov v2.b[5], w8 -; NONEON-NOSVE-NEXT: sdiv w9, w14, w13 -; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 -; NONEON-NOSVE-NEXT: mov v2.b[6], w8 -; NONEON-NOSVE-NEXT: msub w8, w9, w13, w14 -; NONEON-NOSVE-NEXT: mov v2.b[7], w8 -; NONEON-NOSVE-NEXT: fmov d0, d2 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; 
NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = srem <8 x i8> %op1, %op2 ret <8 x i8> %res @@ -182,108 +184,90 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: srem_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #-80]! 
// 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #16] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #32] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 -; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 -; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 -; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 -; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 -; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 -; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 -; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 -; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 -; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 -; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 -; NONEON-NOSVE-NEXT: smov w11, v1.b[0] -; NONEON-NOSVE-NEXT: smov w12, v0.b[0] -; NONEON-NOSVE-NEXT: smov w8, v1.b[1] -; NONEON-NOSVE-NEXT: smov w9, v0.b[1] -; NONEON-NOSVE-NEXT: smov w14, v1.b[2] -; NONEON-NOSVE-NEXT: smov w15, v0.b[2] -; NONEON-NOSVE-NEXT: smov w17, v1.b[3] -; NONEON-NOSVE-NEXT: smov w18, v0.b[3] -; NONEON-NOSVE-NEXT: smov w1, v1.b[4] -; NONEON-NOSVE-NEXT: smov w2, v0.b[4] -; NONEON-NOSVE-NEXT: smov w4, v1.b[5] -; NONEON-NOSVE-NEXT: smov w5, v0.b[5] -; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 -; NONEON-NOSVE-NEXT: smov w7, v1.b[6] -; NONEON-NOSVE-NEXT: smov w19, v0.b[6] -; NONEON-NOSVE-NEXT: smov w21, v1.b[7] -; NONEON-NOSVE-NEXT: smov w22, v0.b[7] -; NONEON-NOSVE-NEXT: smov w24, v1.b[8] -; NONEON-NOSVE-NEXT: smov w25, v0.b[8] -; NONEON-NOSVE-NEXT: smov w27, v1.b[9] -; NONEON-NOSVE-NEXT: smov w28, v0.b[9] -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 -; NONEON-NOSVE-NEXT: smov w13, v1.b[11] -; NONEON-NOSVE-NEXT: fmov s2, w11 -; NONEON-NOSVE-NEXT: smov w11, v0.b[10] -; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 ; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: smov w10, v1.b[10] -; NONEON-NOSVE-NEXT: mov v2.b[1], w8 -; NONEON-NOSVE-NEXT: sdiv w0, w18, w17 -; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 -; NONEON-NOSVE-NEXT: smov w14, v0.b[11] -; NONEON-NOSVE-NEXT: smov w16, v1.b[12] -; NONEON-NOSVE-NEXT: mov v2.b[2], w8 -; NONEON-NOSVE-NEXT: sdiv w3, w2, w1 -; NONEON-NOSVE-NEXT: msub w8, w0, w17, w18 -; NONEON-NOSVE-NEXT: smov w17, v0.b[12] -; NONEON-NOSVE-NEXT: smov w0, v1.b[13] -; NONEON-NOSVE-NEXT: mov v2.b[3], w8 -; NONEON-NOSVE-NEXT: sdiv w6, w5, w4 -; NONEON-NOSVE-NEXT: msub w8, w3, w1, w2 -; NONEON-NOSVE-NEXT: smov w1, v0.b[13] -; NONEON-NOSVE-NEXT: mov v2.b[4], w8 -; NONEON-NOSVE-NEXT: sdiv w20, w19, w7 -; NONEON-NOSVE-NEXT: msub w8, w6, w4, w5 -; NONEON-NOSVE-NEXT: mov v2.b[5], w8 -; NONEON-NOSVE-NEXT: sdiv w23, w22, w21 -; NONEON-NOSVE-NEXT: msub w8, w20, w7, w19 -; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v2.b[6], w8 -; NONEON-NOSVE-NEXT: sdiv w26, w25, w24 -; NONEON-NOSVE-NEXT: msub w8, w23, w21, w22 -; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v2.b[7], w8 -; NONEON-NOSVE-NEXT: sdiv w9, w28, w27 -; NONEON-NOSVE-NEXT: msub w8, w26, w24, w25 -; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #32] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #16] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v2.b[8], w8 -; NONEON-NOSVE-NEXT: sdiv w12, w11, w10 -; NONEON-NOSVE-NEXT: msub w8, w9, w27, w28 -; NONEON-NOSVE-NEXT: mov v2.b[9], w8 -; NONEON-NOSVE-NEXT: sdiv w15, w14, w13 -; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 -; NONEON-NOSVE-NEXT: smov w10, v1.b[14] -; NONEON-NOSVE-NEXT: smov w11, v0.b[14] -; NONEON-NOSVE-NEXT: mov v2.b[10], w8 -; NONEON-NOSVE-NEXT: sdiv w18, w17, w16 
-; NONEON-NOSVE-NEXT: msub w8, w15, w13, w14 -; NONEON-NOSVE-NEXT: smov w13, v1.b[15] -; NONEON-NOSVE-NEXT: smov w14, v0.b[15] -; NONEON-NOSVE-NEXT: mov v2.b[11], w8 -; NONEON-NOSVE-NEXT: sdiv w9, w1, w0 -; NONEON-NOSVE-NEXT: msub w8, w18, w16, w17 -; NONEON-NOSVE-NEXT: mov v2.b[12], w8 -; NONEON-NOSVE-NEXT: sdiv w12, w11, w10 -; NONEON-NOSVE-NEXT: msub w8, w9, w0, w1 -; NONEON-NOSVE-NEXT: mov v2.b[13], w8 -; NONEON-NOSVE-NEXT: sdiv w9, w14, w13 -; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 -; NONEON-NOSVE-NEXT: mov v2.b[14], w8 -; NONEON-NOSVE-NEXT: msub w8, w9, w13, w14 -; NONEON-NOSVE-NEXT: mov v2.b[15], w8 -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b -; NONEON-NOSVE-NEXT: ldp x28, x27, [sp], #80 // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb 
w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = srem <16 x i8> %op1, %op2 
ret <16 x i8> %res @@ -375,275 +359,175 @@ define void @srem_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: srem_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub sp, sp, #320 -; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #224] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #240] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #256] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #272] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #288] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #304] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 320 -; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 -; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 -; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 -; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 -; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 -; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 -; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 -; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 -; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 -; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 -; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 -; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] -; NONEON-NOSVE-NEXT: ldr q3, [x1] -; NONEON-NOSVE-NEXT: ldr q2, [x0] -; NONEON-NOSVE-NEXT: str x0, [sp, #216] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: smov w8, v1.b[1] -; NONEON-NOSVE-NEXT: smov w9, v0.b[1] -; NONEON-NOSVE-NEXT: smov w4, v3.b[1] -; NONEON-NOSVE-NEXT: smov w1, v2.b[1] -; NONEON-NOSVE-NEXT: smov w7, v3.b[7] -; NONEON-NOSVE-NEXT: smov w5, v2.b[7] -; NONEON-NOSVE-NEXT: smov w6, v3.b[8] -; NONEON-NOSVE-NEXT: smov w3, v2.b[8] -; NONEON-NOSVE-NEXT: smov w22, v3.b[9] -; NONEON-NOSVE-NEXT: smov w20, v2.b[9] -; NONEON-NOSVE-NEXT: smov w13, v3.b[0] -; NONEON-NOSVE-NEXT: smov w17, v3.b[3] -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: str w8, [sp, #100] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: smov w8, 
v1.b[0] -; NONEON-NOSVE-NEXT: str w9, [sp, #108] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: smov w9, v0.b[0] -; NONEON-NOSVE-NEXT: smov w14, v2.b[3] -; NONEON-NOSVE-NEXT: smov w15, v3.b[4] -; NONEON-NOSVE-NEXT: smov w12, v2.b[4] -; NONEON-NOSVE-NEXT: smov w2, v3.b[5] -; NONEON-NOSVE-NEXT: smov w18, v2.b[5] -; NONEON-NOSVE-NEXT: smov w0, v3.b[6] -; NONEON-NOSVE-NEXT: smov w16, v2.b[6] -; NONEON-NOSVE-NEXT: smov w21, v3.b[10] -; NONEON-NOSVE-NEXT: smov w19, v2.b[10] -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #36] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: ldr w30, [sp, #36] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: str w10, [sp, #116] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: smov w8, v1.b[2] -; NONEON-NOSVE-NEXT: smov w9, v0.b[2] -; NONEON-NOSVE-NEXT: stp w10, w8, [sp, #44] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: smov w8, v1.b[3] -; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #52] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: smov w9, v0.b[3] -; NONEON-NOSVE-NEXT: sdiv w26, w14, w17 -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w11, w9, w8 -; NONEON-NOSVE-NEXT: smov w8, v1.b[4] -; NONEON-NOSVE-NEXT: smov w9, v0.b[4] -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #60] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: smov w8, v1.b[5] -; NONEON-NOSVE-NEXT: smov w9, v0.b[5] -; NONEON-NOSVE-NEXT: str w8, [sp, #96] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w9, [sp, #104] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w10, [sp, #68] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: smov w8, v1.b[6] -; NONEON-NOSVE-NEXT: smov w9, v0.b[6] -; NONEON-NOSVE-NEXT: stp w11, w8, [sp, #80] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: str w10, [sp, #112] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: smov w8, v1.b[7] -; NONEON-NOSVE-NEXT: stp w9, 
w10, [sp, #88] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: smov w9, v0.b[7] -; NONEON-NOSVE-NEXT: sdiv w25, w12, w15 -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #132] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: smov w8, v1.b[8] -; NONEON-NOSVE-NEXT: smov w9, v0.b[8] -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: str w10, [sp, #140] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: smov w8, v1.b[9] -; NONEON-NOSVE-NEXT: smov w9, v0.b[9] -; NONEON-NOSVE-NEXT: str w8, [sp, #148] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w9, [sp, #156] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w11, w9, w8 -; NONEON-NOSVE-NEXT: smov w8, v1.b[10] -; NONEON-NOSVE-NEXT: smov w9, v0.b[10] -; NONEON-NOSVE-NEXT: str w10, [sp, #128] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #204] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: smov w8, v1.b[11] -; NONEON-NOSVE-NEXT: smov w9, v0.b[11] -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #192] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: str w10, [sp, #212] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: smov w8, v1.b[12] -; NONEON-NOSVE-NEXT: smov w9, v0.b[12] -; NONEON-NOSVE-NEXT: str w8, [sp, #172] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w9, [sp, #180] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w10, [sp, #200] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: smov w8, v1.b[13] -; NONEON-NOSVE-NEXT: smov w9, v0.b[13] -; NONEON-NOSVE-NEXT: stp w11, w8, [sp, #164] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: smov w11, v3.b[2] -; NONEON-NOSVE-NEXT: str w9, [sp, #176] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w10, [sp, #188] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: smov w8, v1.b[14] -; NONEON-NOSVE-NEXT: smov w9, v0.b[14] -; NONEON-NOSVE-NEXT: str 
w8, [sp, #144] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w9, [sp, #152] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w10, [sp, #184] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: smov w9, v2.b[2] -; NONEON-NOSVE-NEXT: sdiv w8, w1, w4 -; NONEON-NOSVE-NEXT: str w10, [sp, #160] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: smov w10, v2.b[0] -; NONEON-NOSVE-NEXT: str w8, [sp, #24] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w8, w5, w7 -; NONEON-NOSVE-NEXT: str w8, [sp, #28] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w8, w3, w6 -; NONEON-NOSVE-NEXT: str w8, [sp, #20] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w8, w20, w22 -; NONEON-NOSVE-NEXT: sdiv w24, w10, w13 -; NONEON-NOSVE-NEXT: str w8, [sp, #32] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: ldp w29, w8, [sp, #40] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w8, w8, w30, w29 -; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #224] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: fmov s4, w8 -; NONEON-NOSVE-NEXT: sdiv w23, w9, w11 -; NONEON-NOSVE-NEXT: msub w10, w24, w13, w10 -; NONEON-NOSVE-NEXT: ldr w13, [sp, #24] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: ldr w24, [sp, #100] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w13, w13, w4, w1 -; NONEON-NOSVE-NEXT: ldr w1, [sp, #116] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: ldr w4, [sp, #108] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: fmov s5, w10 -; NONEON-NOSVE-NEXT: msub w1, w1, w24, w4 -; NONEON-NOSVE-NEXT: mov v5.b[1], w13 -; NONEON-NOSVE-NEXT: mov v4.b[1], w1 -; NONEON-NOSVE-NEXT: ldr w1, [sp, #120] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w8, w23, w11, w9 -; NONEON-NOSVE-NEXT: ldr w11, [sp, #48] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: sdiv w28, w18, w2 -; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #52] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #272] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.b[2], w8 -; NONEON-NOSVE-NEXT: msub w8, w26, w17, w14 
-; NONEON-NOSVE-NEXT: ldr w14, [sp, #72] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w9, w9, w11, w10 -; NONEON-NOSVE-NEXT: ldr w17, [sp, #96] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: smov w10, v3.b[11] -; NONEON-NOSVE-NEXT: smov w11, v2.b[11] -; NONEON-NOSVE-NEXT: mov v4.b[2], w9 -; NONEON-NOSVE-NEXT: mov v5.b[3], w8 -; NONEON-NOSVE-NEXT: msub w8, w25, w15, w12 -; NONEON-NOSVE-NEXT: ldp w13, w9, [sp, #76] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: sdiv w27, w16, w0 -; NONEON-NOSVE-NEXT: ldr w15, [sp, #104] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #256] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w9, w9, w14, w13 -; NONEON-NOSVE-NEXT: ldr w14, [sp, #60] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.b[4], w8 -; NONEON-NOSVE-NEXT: msub w8, w28, w2, w18 -; NONEON-NOSVE-NEXT: ldr w2, [sp, #156] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.b[3], w9 -; NONEON-NOSVE-NEXT: ldp w12, w9, [sp, #64] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.b[5], w8 -; NONEON-NOSVE-NEXT: msub w8, w27, w0, w16 -; NONEON-NOSVE-NEXT: ldr w0, [sp, #132] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: sdiv w4, w19, w21 -; NONEON-NOSVE-NEXT: msub w9, w9, w14, w12 -; NONEON-NOSVE-NEXT: smov w12, v3.b[12] -; NONEON-NOSVE-NEXT: smov w14, v2.b[12] -; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #240] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.b[6], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.b[4], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #112] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w8, w8, w7, w5 -; NONEON-NOSVE-NEXT: ldr w5, [sp, #204] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w9, w9, w17, w15 -; NONEON-NOSVE-NEXT: ldr w17, [sp, #84] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.b[7], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: sdiv w13, w11, w10 -; NONEON-NOSVE-NEXT: mov v4.b[5], w9 -; NONEON-NOSVE-NEXT: ldp 
w16, w9, [sp, #88] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w8, w8, w6, w3 -; NONEON-NOSVE-NEXT: ldr w3, [sp, #148] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w9, w9, w17, w16 -; NONEON-NOSVE-NEXT: smov w16, v3.b[13] -; NONEON-NOSVE-NEXT: smov w17, v2.b[13] -; NONEON-NOSVE-NEXT: mov v5.b[8], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.b[6], w9 -; NONEON-NOSVE-NEXT: msub w8, w8, w22, w20 -; NONEON-NOSVE-NEXT: sdiv w15, w14, w12 -; NONEON-NOSVE-NEXT: ldp w18, w9, [sp, #136] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.b[9], w8 -; NONEON-NOSVE-NEXT: msub w8, w4, w21, w19 -; NONEON-NOSVE-NEXT: msub w9, w9, w0, w18 -; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #304] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #288] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.b[7], w9 -; NONEON-NOSVE-NEXT: mov v5.b[10], w8 -; NONEON-NOSVE-NEXT: msub w8, w13, w10, w11 -; NONEON-NOSVE-NEXT: ldp w0, w9, [sp, #124] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp w11, w10, [sp, #196] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: ldr w13, [sp, #192] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: sdiv w18, w17, w16 -; NONEON-NOSVE-NEXT: msub w9, w9, w1, w0 -; NONEON-NOSVE-NEXT: mov v5.b[11], w8 -; NONEON-NOSVE-NEXT: smov w0, v3.b[14] -; NONEON-NOSVE-NEXT: msub w10, w10, w13, w11 -; NONEON-NOSVE-NEXT: smov w1, v2.b[14] -; NONEON-NOSVE-NEXT: msub w8, w15, w12, w14 -; NONEON-NOSVE-NEXT: mov v4.b[8], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #164] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp w15, w13, [sp, #168] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w9, w9, w3, w2 -; NONEON-NOSVE-NEXT: mov v5.b[12], w8 -; NONEON-NOSVE-NEXT: ldp w4, w3, [sp, #208] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp w14, w12, [sp, #176] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.b[9], w9 -; NONEON-NOSVE-NEXT: sdiv w2, w1, w0 -; NONEON-NOSVE-NEXT: smov w9, v3.b[15] -; NONEON-NOSVE-NEXT: msub 
w3, w3, w5, w4 -; NONEON-NOSVE-NEXT: smov w4, v2.b[15] -; NONEON-NOSVE-NEXT: msub w8, w18, w16, w17 -; NONEON-NOSVE-NEXT: ldr w16, [sp, #144] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.b[10], w3 -; NONEON-NOSVE-NEXT: mov v5.b[13], w8 -; NONEON-NOSVE-NEXT: mov v4.b[11], w10 -; NONEON-NOSVE-NEXT: ldr w10, [sp, #188] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: sdiv w11, w4, w9 -; NONEON-NOSVE-NEXT: msub w8, w2, w0, w1 -; NONEON-NOSVE-NEXT: msub w10, w10, w13, w12 -; NONEON-NOSVE-NEXT: smov w12, v1.b[15] -; NONEON-NOSVE-NEXT: smov w13, v0.b[15] -; NONEON-NOSVE-NEXT: mov v5.b[14], w8 -; NONEON-NOSVE-NEXT: mov v4.b[12], w10 -; NONEON-NOSVE-NEXT: ldr w10, [sp, #184] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w10, w10, w15, w14 -; NONEON-NOSVE-NEXT: ldr w15, [sp, #152] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: sdiv w14, w13, w12 -; NONEON-NOSVE-NEXT: msub w8, w11, w9, w4 -; NONEON-NOSVE-NEXT: mov v4.b[13], w10 -; NONEON-NOSVE-NEXT: ldr w10, [sp, #160] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.b[15], w8 -; NONEON-NOSVE-NEXT: ldr x8, [sp, #216] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w10, w10, w16, w15 -; NONEON-NOSVE-NEXT: mov v4.b[14], w10 -; NONEON-NOSVE-NEXT: msub w9, w14, w12, w13 -; NONEON-NOSVE-NEXT: mov v4.b[15], w9 -; NONEON-NOSVE-NEXT: stp q5, q4, [x8] -; NONEON-NOSVE-NEXT: add sp, sp, #320 +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #47] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #62] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: 
ldrsb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #61] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #60] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #59] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #58] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #57] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #56] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #55] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #54] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #53] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #52] +; NONEON-NOSVE-NEXT: sdiv w10, 
w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #51] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #50] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #49] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #48] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; 
NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: 
ldrsb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -669,29 +553,31 @@ define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: srem_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: smov w11, v1.h[0] -; NONEON-NOSVE-NEXT: smov w12, v0.h[0] -; NONEON-NOSVE-NEXT: smov w8, v1.h[1] -; NONEON-NOSVE-NEXT: smov w9, v0.h[1] -; NONEON-NOSVE-NEXT: smov w14, v1.h[2] -; NONEON-NOSVE-NEXT: smov w15, v0.h[2] -; NONEON-NOSVE-NEXT: smov w17, v1.h[3] -; NONEON-NOSVE-NEXT: smov w18, v0.h[3] -; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] ; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 -; NONEON-NOSVE-NEXT: fmov s0, w11 
-; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 ; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: mov v0.h[1], w8 -; NONEON-NOSVE-NEXT: sdiv w9, w18, w17 -; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 -; NONEON-NOSVE-NEXT: mov v0.h[2], w8 -; NONEON-NOSVE-NEXT: msub w8, w9, w17, w18 -; NONEON-NOSVE-NEXT: mov v0.h[3], w8 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = srem <4 x i16> %op1, %op2 ret <4 x i16> %res @@ -724,47 +610,50 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: srem_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smov w11, v1.h[0] -; NONEON-NOSVE-NEXT: smov w12, v0.h[0] -; NONEON-NOSVE-NEXT: smov w8, v1.h[1] -; NONEON-NOSVE-NEXT: smov w9, v0.h[1] -; NONEON-NOSVE-NEXT: smov w14, v1.h[2] -; NONEON-NOSVE-NEXT: smov w15, v0.h[2] -; NONEON-NOSVE-NEXT: smov w17, v1.h[3] -; NONEON-NOSVE-NEXT: smov w18, v0.h[3] -; NONEON-NOSVE-NEXT: smov w1, v1.h[4] -; NONEON-NOSVE-NEXT: smov w2, v0.h[4] -; NONEON-NOSVE-NEXT: smov w4, v1.h[5] -; NONEON-NOSVE-NEXT: smov w5, v0.h[5] -; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] ; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 -; NONEON-NOSVE-NEXT: smov w13, v1.h[7] -; NONEON-NOSVE-NEXT: fmov s2, w11 -; NONEON-NOSVE-NEXT: smov w11, v0.h[6] -; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 ; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: smov w10, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[1], w8 -; NONEON-NOSVE-NEXT: sdiv w0, w18, w17 -; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 -; NONEON-NOSVE-NEXT: smov w14, v0.h[7] -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: sdiv w3, w2, w1 -; NONEON-NOSVE-NEXT: msub w8, w0, w17, w18 -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: sdiv w9, w5, w4 -; NONEON-NOSVE-NEXT: msub w8, w3, w1, w2 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: sdiv w12, w11, w10 -; NONEON-NOSVE-NEXT: msub w8, w9, w4, w5 -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: sdiv w9, w14, w13 -; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 -; NONEON-NOSVE-NEXT: msub w8, w9, w13, w14 -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, 
#22] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = srem <8 x i16> %op1, %op2 ret <8 x i16> %res @@ -813,135 +702,95 @@ define void @srem_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: srem_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub sp, sp, #144 -; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #48] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #64] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #80] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #96] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #112] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #128] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 144 -; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 -; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 -; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 -; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 -; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 -; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 -; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 -; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 -; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 -; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 -; NONEON-NOSVE-NEXT: .cfi_offset 
w30, -88 -; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x0] -; NONEON-NOSVE-NEXT: ldr q3, [x1] -; NONEON-NOSVE-NEXT: smov w8, v1.h[1] -; NONEON-NOSVE-NEXT: smov w9, v0.h[1] -; NONEON-NOSVE-NEXT: smov w20, v1.h[0] -; NONEON-NOSVE-NEXT: smov w21, v0.h[0] -; NONEON-NOSVE-NEXT: smov w19, v0.h[3] -; NONEON-NOSVE-NEXT: smov w5, v1.h[4] -; NONEON-NOSVE-NEXT: smov w2, v0.h[4] -; NONEON-NOSVE-NEXT: smov w1, v3.h[1] -; NONEON-NOSVE-NEXT: smov w23, v2.h[1] -; NONEON-NOSVE-NEXT: smov w25, v3.h[0] -; NONEON-NOSVE-NEXT: smov w26, v2.h[0] -; NONEON-NOSVE-NEXT: smov w6, v1.h[5] -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #36] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: smov w8, v1.h[2] -; NONEON-NOSVE-NEXT: smov w9, v0.h[2] -; NONEON-NOSVE-NEXT: smov w3, v0.h[5] -; NONEON-NOSVE-NEXT: smov w4, v1.h[6] -; NONEON-NOSVE-NEXT: smov w7, v0.h[6] -; NONEON-NOSVE-NEXT: smov w28, v3.h[2] -; NONEON-NOSVE-NEXT: smov w29, v2.h[2] -; NONEON-NOSVE-NEXT: smov w15, v3.h[3] -; NONEON-NOSVE-NEXT: smov w13, v2.h[3] -; NONEON-NOSVE-NEXT: smov w12, v3.h[4] -; NONEON-NOSVE-NEXT: smov w14, v3.h[5] -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w11, w21, w20 -; NONEON-NOSVE-NEXT: str w10, [sp, #44] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: smov w8, v1.h[3] -; NONEON-NOSVE-NEXT: stp w8, w11, [sp] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: smov w11, v2.h[4] -; NONEON-NOSVE-NEXT: ldr w22, [sp, #4] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w20, w22, w20, w21 -; NONEON-NOSVE-NEXT: sdiv w9, w19, w8 -; NONEON-NOSVE-NEXT: str w10, [sp, #32] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: smov w10, v3.h[6] -; NONEON-NOSVE-NEXT: fmov s5, w20 -; NONEON-NOSVE-NEXT: smov w20, v3.h[7] -; NONEON-NOSVE-NEXT: sdiv w8, w2, w5 -; NONEON-NOSVE-NEXT: sdiv w24, w23, w1 -; NONEON-NOSVE-NEXT: 
stp w8, w9, [sp, #16] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w27, w26, w25 -; NONEON-NOSVE-NEXT: msub w1, w24, w1, w23 -; NONEON-NOSVE-NEXT: ldp w24, w23, [sp, #40] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: sdiv w9, w3, w6 -; NONEON-NOSVE-NEXT: msub w21, w27, w25, w26 -; NONEON-NOSVE-NEXT: ldr w25, [sp, #36] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w23, w23, w25, w24 -; NONEON-NOSVE-NEXT: ldr w25, [sp, #24] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: fmov s4, w21 -; NONEON-NOSVE-NEXT: mov v5.h[1], w23 -; NONEON-NOSVE-NEXT: ldp w23, w21, [sp, #28] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.h[1], w1 -; NONEON-NOSVE-NEXT: sdiv w8, w7, w4 -; NONEON-NOSVE-NEXT: msub w21, w21, w25, w23 -; NONEON-NOSVE-NEXT: smov w23, v2.h[7] -; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #80] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.h[2], w21 -; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #112] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: sdiv w30, w29, w28 -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: smov w9, v2.h[5] -; NONEON-NOSVE-NEXT: smov w8, v2.h[6] -; NONEON-NOSVE-NEXT: sdiv w18, w13, w15 -; NONEON-NOSVE-NEXT: msub w1, w30, w28, w29 -; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #64] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #48] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.h[2], w1 -; NONEON-NOSVE-NEXT: sdiv w16, w11, w12 -; NONEON-NOSVE-NEXT: msub w13, w18, w15, w13 -; NONEON-NOSVE-NEXT: ldr w15, [sp, #20] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: ldr w18, [sp] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w15, w15, w18, w19 -; NONEON-NOSVE-NEXT: mov v4.h[3], w13 -; NONEON-NOSVE-NEXT: smov w13, v1.h[7] -; NONEON-NOSVE-NEXT: mov v5.h[3], w15 -; NONEON-NOSVE-NEXT: smov w15, v0.h[7] -; NONEON-NOSVE-NEXT: sdiv w17, w9, w14 -; NONEON-NOSVE-NEXT: msub w11, w16, w12, w11 -; NONEON-NOSVE-NEXT: ldr w12, [sp, #16] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w12, 
w12, w5, w2 -; NONEON-NOSVE-NEXT: mov v4.h[4], w11 -; NONEON-NOSVE-NEXT: ldr w11, [sp, #12] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.h[4], w12 -; NONEON-NOSVE-NEXT: msub w11, w11, w6, w3 -; NONEON-NOSVE-NEXT: sdiv w24, w8, w10 -; NONEON-NOSVE-NEXT: msub w9, w17, w14, w9 -; NONEON-NOSVE-NEXT: mov v5.h[5], w11 -; NONEON-NOSVE-NEXT: mov v4.h[5], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w9, w9, w4, w7 -; NONEON-NOSVE-NEXT: sdiv w18, w23, w20 -; NONEON-NOSVE-NEXT: msub w8, w24, w10, w8 -; NONEON-NOSVE-NEXT: mov v5.h[6], w9 -; NONEON-NOSVE-NEXT: mov v4.h[6], w8 -; NONEON-NOSVE-NEXT: sdiv w12, w15, w13 -; NONEON-NOSVE-NEXT: msub w8, w18, w20, w23 -; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #128] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #96] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.h[7], w8 -; NONEON-NOSVE-NEXT: msub w9, w12, w13, w15 -; NONEON-NOSVE-NEXT: mov v5.h[7], w9 -; NONEON-NOSVE-NEXT: stp q4, q5, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #144 +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #46] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #60] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #58] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, 
#56] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #54] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #52] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #50] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #48] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #4] +; 
NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -964,19 +813,20 @@ define <2 x i32> @srem_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: srem_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: fmov w8, s1 -; NONEON-NOSVE-NEXT: fmov w9, s0 -; NONEON-NOSVE-NEXT: mov w11, v1.s[1] -; NONEON-NOSVE-NEXT: mov w12, v0.s[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w11, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w10, w11, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w11 +; NONEON-NOSVE-NEXT: str w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] ; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 ; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: fmov s0, w8 -; NONEON-NOSVE-NEXT: msub w9, w13, w11, w12 -; NONEON-NOSVE-NEXT: mov v0.s[1], w9 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; 
NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = srem <2 x i32> %op1, %op2 ret <2 x i32> %res @@ -996,26 +846,28 @@ define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: srem_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov w11, s1 -; NONEON-NOSVE-NEXT: fmov w12, s0 -; NONEON-NOSVE-NEXT: mov w8, v1.s[1] -; NONEON-NOSVE-NEXT: mov w9, v0.s[1] -; NONEON-NOSVE-NEXT: mov w14, v1.s[2] -; NONEON-NOSVE-NEXT: mov w15, v0.s[2] -; NONEON-NOSVE-NEXT: mov w17, v1.s[3] -; NONEON-NOSVE-NEXT: mov w18, v0.s[3] -; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w11, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: sdiv w10, w11, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w11 +; NONEON-NOSVE-NEXT: str w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #4] ; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 -; NONEON-NOSVE-NEXT: fmov s0, w11 -; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 ; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: mov v0.s[1], w8 -; NONEON-NOSVE-NEXT: sdiv w9, w18, w17 -; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 -; NONEON-NOSVE-NEXT: mov v0.s[2], w8 -; NONEON-NOSVE-NEXT: msub w8, w9, w17, w18 -; NONEON-NOSVE-NEXT: mov v0.s[3], w8 +; NONEON-NOSVE-NEXT: ldr w9, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #36] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: str w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = srem <4 x i32> %op1, %op2 ret <4 x i32> %res @@ -1039,61 +891,50 @@ define void @srem_v8i32(ptr %a, 
ptr %b) { ; ; NONEON-NOSVE-LABEL: srem_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str x23, [sp, #-48]! // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 -; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 -; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 -; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 -; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 -; NONEON-NOSVE-NEXT: .cfi_offset w23, -48 -; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] -; NONEON-NOSVE-NEXT: fmov w12, s0 -; NONEON-NOSVE-NEXT: fmov w3, s2 -; NONEON-NOSVE-NEXT: mov w9, v0.s[1] -; NONEON-NOSVE-NEXT: fmov w11, s1 -; NONEON-NOSVE-NEXT: fmov w2, s3 -; NONEON-NOSVE-NEXT: mov w8, v1.s[1] -; NONEON-NOSVE-NEXT: mov w17, v3.s[1] -; NONEON-NOSVE-NEXT: mov w18, v2.s[1] -; NONEON-NOSVE-NEXT: mov w14, v1.s[2] -; NONEON-NOSVE-NEXT: mov w15, v0.s[2] -; NONEON-NOSVE-NEXT: mov w5, v3.s[2] -; NONEON-NOSVE-NEXT: mov w6, v2.s[2] -; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 -; NONEON-NOSVE-NEXT: mov w19, v3.s[3] -; NONEON-NOSVE-NEXT: mov w20, v2.s[3] -; NONEON-NOSVE-NEXT: mov w22, v1.s[3] -; NONEON-NOSVE-NEXT: mov w23, v0.s[3] -; NONEON-NOSVE-NEXT: sdiv w4, w3, w2 -; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 -; NONEON-NOSVE-NEXT: fmov s1, w11 -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: msub w12, w4, w2, w3 -; NONEON-NOSVE-NEXT: fmov s0, w12 -; NONEON-NOSVE-NEXT: sdiv w1, w18, w17 -; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: mov v1.s[1], w8 -; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 -; NONEON-NOSVE-NEXT: msub w13, w1, w17, w18 -; NONEON-NOSVE-NEXT: mov v0.s[1], w13 -; NONEON-NOSVE-NEXT: sdiv w7, w6, w5 -; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 -; NONEON-NOSVE-NEXT: mov v1.s[2], w8 -; NONEON-NOSVE-NEXT: sdiv w21, w20, w19 -; NONEON-NOSVE-NEXT: msub w10, w7, w5, w6 -; NONEON-NOSVE-NEXT: mov v0.s[2], w10 -; 
NONEON-NOSVE-NEXT: sdiv w9, w23, w22 -; NONEON-NOSVE-NEXT: msub w10, w21, w19, w20 -; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v0.s[3], w10 -; NONEON-NOSVE-NEXT: msub w8, w9, w22, w23 -; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v1.s[3], w8 +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: sdiv w10, w11, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w11 +; NONEON-NOSVE-NEXT: str w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #36] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #84] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #12] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #76] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldp w9, w11, [sp, #4] +; NONEON-NOSVE-NEXT: sdiv w10, w11, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w11 +; NONEON-NOSVE-NEXT: str w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w9, [sp] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; 
NONEON-NOSVE-NEXT: stp w8, w11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: ldr x23, [sp], #48 // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -1116,13 +957,15 @@ define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: srem_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fmov x8, d1 ; NONEON-NOSVE-NEXT: fmov x9, d0 ; NONEON-NOSVE-NEXT: sdiv x10, x9, x8 ; NONEON-NOSVE-NEXT: msub x8, x10, x8, x9 -; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = srem <1 x i64> %op1, %op2 ret <1 x i64> %res @@ -1142,16 +985,19 @@ define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: srem_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d1 -; NONEON-NOSVE-NEXT: fmov x9, d0 -; NONEON-NOSVE-NEXT: mov x11, v1.d[1] -; NONEON-NOSVE-NEXT: mov x12, v0.d[1] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x11, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: sdiv x10, x11, x8 +; NONEON-NOSVE-NEXT: msub x8, x10, x8, x11 +; NONEON-NOSVE-NEXT: str x8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] ; NONEON-NOSVE-NEXT: sdiv x10, x9, x8 -; NONEON-NOSVE-NEXT: sdiv x13, x12, x11 ; NONEON-NOSVE-NEXT: msub x8, x10, x8, x9 -; NONEON-NOSVE-NEXT: fmov d0, x8 -; NONEON-NOSVE-NEXT: msub x9, x13, x11, x12 -; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = srem <2 x i64> %op1, %op2 ret <2 x i64> %res @@ -1175,29 +1021,33 @@ define void @srem_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: srem_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] -; NONEON-NOSVE-NEXT: fmov x9, d0 -; NONEON-NOSVE-NEXT: fmov x15, d2 -; NONEON-NOSVE-NEXT: mov x12, v2.d[1] -; NONEON-NOSVE-NEXT: fmov x8, d1 -; NONEON-NOSVE-NEXT: fmov x14, d3 -; NONEON-NOSVE-NEXT: mov x11, v3.d[1] -; NONEON-NOSVE-NEXT: mov x17, v1.d[1] -; NONEON-NOSVE-NEXT: mov x18, v0.d[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: sdiv x10, x11, x8 +; NONEON-NOSVE-NEXT: msub x8, x10, x8, x11 +; NONEON-NOSVE-NEXT: str x8, [sp, #88] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: sdiv x10, x9, x8 +; NONEON-NOSVE-NEXT: msub x11, x10, x8, x9 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #8] +; NONEON-NOSVE-NEXT: sdiv x10, x9, x8 +; NONEON-NOSVE-NEXT: msub x8, x10, x8, x9 +; NONEON-NOSVE-NEXT: ldr x9, [sp] +; 
NONEON-NOSVE-NEXT: stp x8, x11, [sp, #72] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] ; NONEON-NOSVE-NEXT: sdiv x10, x9, x8 -; NONEON-NOSVE-NEXT: sdiv x16, x15, x14 ; NONEON-NOSVE-NEXT: msub x8, x10, x8, x9 -; NONEON-NOSVE-NEXT: fmov d1, x8 -; NONEON-NOSVE-NEXT: sdiv x13, x12, x11 -; NONEON-NOSVE-NEXT: msub x10, x16, x14, x15 -; NONEON-NOSVE-NEXT: fmov d0, x10 -; NONEON-NOSVE-NEXT: sdiv x1, x18, x17 -; NONEON-NOSVE-NEXT: msub x9, x13, x11, x12 -; NONEON-NOSVE-NEXT: mov v0.d[1], x9 -; NONEON-NOSVE-NEXT: msub x11, x1, x17, x18 -; NONEON-NOSVE-NEXT: mov v1.d[1], x11 +; NONEON-NOSVE-NEXT: str x8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -1229,37 +1079,31 @@ define <4 x i8> @urem_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: urem_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: umov w11, v1.h[0] -; NONEON-NOSVE-NEXT: umov w12, v0.h[0] -; NONEON-NOSVE-NEXT: umov w8, v1.h[1] -; NONEON-NOSVE-NEXT: umov w9, v0.h[1] -; NONEON-NOSVE-NEXT: umov w14, v1.h[2] -; NONEON-NOSVE-NEXT: umov w15, v0.h[2] -; NONEON-NOSVE-NEXT: umov w17, v1.h[3] -; NONEON-NOSVE-NEXT: umov w18, v0.h[3] -; NONEON-NOSVE-NEXT: and w11, w11, #0xff -; NONEON-NOSVE-NEXT: and w12, w12, #0xff -; NONEON-NOSVE-NEXT: and w8, w8, #0xff +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #10] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb 
w18, [sp, #8] ; NONEON-NOSVE-NEXT: udiv w13, w12, w11 -; NONEON-NOSVE-NEXT: and w9, w9, #0xff -; NONEON-NOSVE-NEXT: and w14, w14, #0xff -; NONEON-NOSVE-NEXT: and w15, w15, #0xff -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 -; NONEON-NOSVE-NEXT: and w12, w17, #0xff -; NONEON-NOSVE-NEXT: and w13, w18, #0xff -; NONEON-NOSVE-NEXT: fmov s0, w11 -; NONEON-NOSVE-NEXT: udiv w16, w15, w14 ; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: mov v0.h[1], w8 -; NONEON-NOSVE-NEXT: udiv w9, w13, w12 -; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 -; NONEON-NOSVE-NEXT: mov v0.h[2], w8 -; NONEON-NOSVE-NEXT: msub w8, w9, w12, w13 -; NONEON-NOSVE-NEXT: mov v0.h[3], w8 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: udiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w9, w13, w11, w12 +; NONEON-NOSVE-NEXT: strh w9, [sp, #28] +; NONEON-NOSVE-NEXT: udiv w0, w18, w17 +; NONEON-NOSVE-NEXT: msub w10, w16, w14, w15 +; NONEON-NOSVE-NEXT: strh w10, [sp, #26] +; NONEON-NOSVE-NEXT: msub w8, w0, w17, w18 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = urem <4 x i8> %op1, %op2 ret <4 x i8> %res @@ -1293,49 +1137,51 @@ define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: urem_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: umov w11, v1.b[0] -; NONEON-NOSVE-NEXT: umov w12, v0.b[0] -; NONEON-NOSVE-NEXT: umov w8, v1.b[1] -; NONEON-NOSVE-NEXT: umov w9, v0.b[1] -; NONEON-NOSVE-NEXT: umov w14, v1.b[2] -; NONEON-NOSVE-NEXT: umov w15, v0.b[2] -; NONEON-NOSVE-NEXT: umov w17, v1.b[3] -; NONEON-NOSVE-NEXT: umov w18, v0.b[3] -; NONEON-NOSVE-NEXT: umov w1, v1.b[4] -; NONEON-NOSVE-NEXT: umov w2, v0.b[4] -; NONEON-NOSVE-NEXT: umov w4, v1.b[5] 
-; NONEON-NOSVE-NEXT: umov w5, v0.b[5] -; NONEON-NOSVE-NEXT: udiv w13, w12, w11 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] ; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 -; NONEON-NOSVE-NEXT: umov w13, v1.b[7] -; NONEON-NOSVE-NEXT: fmov s2, w11 -; NONEON-NOSVE-NEXT: umov w11, v0.b[6] -; NONEON-NOSVE-NEXT: udiv w16, w15, w14 ; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: umov w10, v1.b[6] -; NONEON-NOSVE-NEXT: mov v2.b[1], w8 -; NONEON-NOSVE-NEXT: udiv w0, w18, w17 -; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 -; NONEON-NOSVE-NEXT: umov w14, v0.b[7] -; NONEON-NOSVE-NEXT: mov v2.b[2], w8 -; NONEON-NOSVE-NEXT: udiv w3, w2, w1 -; NONEON-NOSVE-NEXT: msub w8, w0, w17, w18 -; NONEON-NOSVE-NEXT: mov v2.b[3], w8 -; NONEON-NOSVE-NEXT: udiv w9, w5, w4 -; NONEON-NOSVE-NEXT: msub w8, w3, w1, w2 -; NONEON-NOSVE-NEXT: mov v2.b[4], w8 -; NONEON-NOSVE-NEXT: udiv w12, w11, w10 -; NONEON-NOSVE-NEXT: msub w8, w9, w4, w5 -; NONEON-NOSVE-NEXT: mov v2.b[5], w8 -; NONEON-NOSVE-NEXT: udiv w9, w14, w13 -; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 -; NONEON-NOSVE-NEXT: mov v2.b[6], w8 -; NONEON-NOSVE-NEXT: msub w8, w9, w13, w14 -; NONEON-NOSVE-NEXT: mov v2.b[7], w8 -; NONEON-NOSVE-NEXT: fmov d0, d2 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; 
NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = urem <8 x i8> %op1, %op2 ret <8 x i8> %res @@ -1389,108 +1235,90 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: urem_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #-80]! 
// 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #16] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #32] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 -; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 -; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 -; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 -; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 -; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 -; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 -; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 -; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 -; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 -; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 -; NONEON-NOSVE-NEXT: umov w11, v1.b[0] -; NONEON-NOSVE-NEXT: umov w12, v0.b[0] -; NONEON-NOSVE-NEXT: umov w8, v1.b[1] -; NONEON-NOSVE-NEXT: umov w9, v0.b[1] -; NONEON-NOSVE-NEXT: umov w14, v1.b[2] -; NONEON-NOSVE-NEXT: umov w15, v0.b[2] -; NONEON-NOSVE-NEXT: umov w17, v1.b[3] -; NONEON-NOSVE-NEXT: umov w18, v0.b[3] -; NONEON-NOSVE-NEXT: umov w1, v1.b[4] -; NONEON-NOSVE-NEXT: umov w2, v0.b[4] -; NONEON-NOSVE-NEXT: umov w4, v1.b[5] -; NONEON-NOSVE-NEXT: umov w5, v0.b[5] -; NONEON-NOSVE-NEXT: udiv w13, w12, w11 -; NONEON-NOSVE-NEXT: umov w7, v1.b[6] -; NONEON-NOSVE-NEXT: umov w19, v0.b[6] -; NONEON-NOSVE-NEXT: umov w21, v1.b[7] -; NONEON-NOSVE-NEXT: umov w22, v0.b[7] -; NONEON-NOSVE-NEXT: umov w24, v1.b[8] -; NONEON-NOSVE-NEXT: umov w25, v0.b[8] -; NONEON-NOSVE-NEXT: umov w27, v1.b[9] -; NONEON-NOSVE-NEXT: umov w28, v0.b[9] -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 -; NONEON-NOSVE-NEXT: umov w13, v1.b[11] -; NONEON-NOSVE-NEXT: fmov s2, w11 -; NONEON-NOSVE-NEXT: umov w11, v0.b[10] -; NONEON-NOSVE-NEXT: udiv w16, w15, w14 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 ; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: umov w10, v1.b[10] -; NONEON-NOSVE-NEXT: mov v2.b[1], w8 -; NONEON-NOSVE-NEXT: udiv w0, w18, w17 -; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 -; NONEON-NOSVE-NEXT: umov w14, v0.b[11] -; NONEON-NOSVE-NEXT: umov w16, v1.b[12] -; NONEON-NOSVE-NEXT: mov v2.b[2], w8 -; NONEON-NOSVE-NEXT: udiv w3, w2, w1 -; NONEON-NOSVE-NEXT: msub w8, w0, w17, w18 -; NONEON-NOSVE-NEXT: umov w17, v0.b[12] -; NONEON-NOSVE-NEXT: umov w0, v1.b[13] -; NONEON-NOSVE-NEXT: mov v2.b[3], w8 -; NONEON-NOSVE-NEXT: udiv w6, w5, w4 -; NONEON-NOSVE-NEXT: msub w8, w3, w1, w2 -; NONEON-NOSVE-NEXT: umov w1, v0.b[13] -; NONEON-NOSVE-NEXT: mov v2.b[4], w8 -; NONEON-NOSVE-NEXT: udiv w20, w19, w7 -; NONEON-NOSVE-NEXT: msub w8, w6, w4, w5 -; NONEON-NOSVE-NEXT: mov v2.b[5], w8 -; NONEON-NOSVE-NEXT: udiv w23, w22, w21 -; NONEON-NOSVE-NEXT: msub w8, w20, w7, w19 -; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v2.b[6], w8 -; NONEON-NOSVE-NEXT: udiv w26, w25, w24 -; NONEON-NOSVE-NEXT: msub w8, w23, w21, w22 -; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v2.b[7], w8 -; NONEON-NOSVE-NEXT: udiv w9, w28, w27 -; NONEON-NOSVE-NEXT: msub w8, w26, w24, w25 -; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #32] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #16] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v2.b[8], w8 -; NONEON-NOSVE-NEXT: udiv w12, w11, w10 -; NONEON-NOSVE-NEXT: msub w8, w9, w27, w28 -; NONEON-NOSVE-NEXT: mov v2.b[9], w8 -; NONEON-NOSVE-NEXT: udiv w15, w14, w13 -; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 -; NONEON-NOSVE-NEXT: umov w10, v1.b[14] -; NONEON-NOSVE-NEXT: umov w11, v0.b[14] -; NONEON-NOSVE-NEXT: mov v2.b[10], w8 -; NONEON-NOSVE-NEXT: udiv w18, w17, w16 -; 
NONEON-NOSVE-NEXT: msub w8, w15, w13, w14 -; NONEON-NOSVE-NEXT: umov w13, v1.b[15] -; NONEON-NOSVE-NEXT: umov w14, v0.b[15] -; NONEON-NOSVE-NEXT: mov v2.b[11], w8 -; NONEON-NOSVE-NEXT: udiv w9, w1, w0 -; NONEON-NOSVE-NEXT: msub w8, w18, w16, w17 -; NONEON-NOSVE-NEXT: mov v2.b[12], w8 -; NONEON-NOSVE-NEXT: udiv w12, w11, w10 -; NONEON-NOSVE-NEXT: msub w8, w9, w0, w1 -; NONEON-NOSVE-NEXT: mov v2.b[13], w8 -; NONEON-NOSVE-NEXT: udiv w9, w14, w13 -; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 -; NONEON-NOSVE-NEXT: mov v2.b[14], w8 -; NONEON-NOSVE-NEXT: msub w8, w9, w13, w14 -; NONEON-NOSVE-NEXT: mov v2.b[15], w8 -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b -; NONEON-NOSVE-NEXT: ldp x28, x27, [sp], #80 // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; 
NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = urem <16 x i8> %op1, %op2 ret <16 x i8> %res @@ -1582,275 
+1410,175 @@ define void @urem_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: urem_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub sp, sp, #320 -; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #224] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #240] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #256] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #272] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #288] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #304] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 320 -; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 -; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 -; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 -; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 -; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 -; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 -; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 -; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 -; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 -; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 -; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 -; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] -; NONEON-NOSVE-NEXT: ldr q3, [x1] -; NONEON-NOSVE-NEXT: ldr q2, [x0] -; NONEON-NOSVE-NEXT: str x0, [sp, #216] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: umov w8, v1.b[1] -; NONEON-NOSVE-NEXT: umov w9, v0.b[1] -; NONEON-NOSVE-NEXT: umov w4, v3.b[1] -; NONEON-NOSVE-NEXT: umov w1, v2.b[1] -; NONEON-NOSVE-NEXT: umov w7, v3.b[7] -; NONEON-NOSVE-NEXT: umov w5, v2.b[7] -; NONEON-NOSVE-NEXT: umov w6, v3.b[8] -; NONEON-NOSVE-NEXT: umov w3, v2.b[8] -; NONEON-NOSVE-NEXT: umov w22, v3.b[9] -; NONEON-NOSVE-NEXT: umov w20, v2.b[9] -; NONEON-NOSVE-NEXT: umov w13, v3.b[0] -; NONEON-NOSVE-NEXT: umov w17, v3.b[3] -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: str w8, [sp, #100] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: umov w8, v1.b[0] -; NONEON-NOSVE-NEXT: str 
w9, [sp, #108] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: umov w9, v0.b[0] -; NONEON-NOSVE-NEXT: umov w14, v2.b[3] -; NONEON-NOSVE-NEXT: umov w15, v3.b[4] -; NONEON-NOSVE-NEXT: umov w12, v2.b[4] -; NONEON-NOSVE-NEXT: umov w2, v3.b[5] -; NONEON-NOSVE-NEXT: umov w18, v2.b[5] -; NONEON-NOSVE-NEXT: umov w0, v3.b[6] -; NONEON-NOSVE-NEXT: umov w16, v2.b[6] -; NONEON-NOSVE-NEXT: umov w21, v3.b[10] -; NONEON-NOSVE-NEXT: umov w19, v2.b[10] -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #36] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: ldr w30, [sp, #36] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: str w10, [sp, #116] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: umov w8, v1.b[2] -; NONEON-NOSVE-NEXT: umov w9, v0.b[2] -; NONEON-NOSVE-NEXT: stp w10, w8, [sp, #44] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: umov w8, v1.b[3] -; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #52] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: umov w9, v0.b[3] -; NONEON-NOSVE-NEXT: udiv w26, w14, w17 -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w11, w9, w8 -; NONEON-NOSVE-NEXT: umov w8, v1.b[4] -; NONEON-NOSVE-NEXT: umov w9, v0.b[4] -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #60] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: umov w8, v1.b[5] -; NONEON-NOSVE-NEXT: umov w9, v0.b[5] -; NONEON-NOSVE-NEXT: str w8, [sp, #96] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w9, [sp, #104] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w10, [sp, #68] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: umov w8, v1.b[6] -; NONEON-NOSVE-NEXT: umov w9, v0.b[6] -; NONEON-NOSVE-NEXT: stp w11, w8, [sp, #80] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: str w10, [sp, #112] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: umov w8, v1.b[7] -; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #88] // 8-byte Folded Spill 
-; NONEON-NOSVE-NEXT: umov w9, v0.b[7] -; NONEON-NOSVE-NEXT: udiv w25, w12, w15 -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #132] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: umov w8, v1.b[8] -; NONEON-NOSVE-NEXT: umov w9, v0.b[8] -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: str w10, [sp, #140] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: umov w8, v1.b[9] -; NONEON-NOSVE-NEXT: umov w9, v0.b[9] -; NONEON-NOSVE-NEXT: str w8, [sp, #148] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w9, [sp, #156] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w11, w9, w8 -; NONEON-NOSVE-NEXT: umov w8, v1.b[10] -; NONEON-NOSVE-NEXT: umov w9, v0.b[10] -; NONEON-NOSVE-NEXT: str w10, [sp, #128] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #204] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: umov w8, v1.b[11] -; NONEON-NOSVE-NEXT: umov w9, v0.b[11] -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #192] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: str w10, [sp, #212] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: umov w8, v1.b[12] -; NONEON-NOSVE-NEXT: umov w9, v0.b[12] -; NONEON-NOSVE-NEXT: str w8, [sp, #172] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w9, [sp, #180] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w10, [sp, #200] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: umov w8, v1.b[13] -; NONEON-NOSVE-NEXT: umov w9, v0.b[13] -; NONEON-NOSVE-NEXT: stp w11, w8, [sp, #164] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: umov w11, v3.b[2] -; NONEON-NOSVE-NEXT: str w9, [sp, #176] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w10, [sp, #188] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: umov w8, v1.b[14] -; NONEON-NOSVE-NEXT: umov w9, v0.b[14] -; NONEON-NOSVE-NEXT: str w8, [sp, #144] // 4-byte Folded Spill 
-; NONEON-NOSVE-NEXT: str w9, [sp, #152] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w10, [sp, #184] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: umov w9, v2.b[2] -; NONEON-NOSVE-NEXT: udiv w8, w1, w4 -; NONEON-NOSVE-NEXT: str w10, [sp, #160] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: umov w10, v2.b[0] -; NONEON-NOSVE-NEXT: str w8, [sp, #24] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w8, w5, w7 -; NONEON-NOSVE-NEXT: str w8, [sp, #28] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w8, w3, w6 -; NONEON-NOSVE-NEXT: str w8, [sp, #20] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w8, w20, w22 -; NONEON-NOSVE-NEXT: udiv w24, w10, w13 -; NONEON-NOSVE-NEXT: str w8, [sp, #32] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: ldp w29, w8, [sp, #40] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w8, w8, w30, w29 -; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #224] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: fmov s4, w8 -; NONEON-NOSVE-NEXT: udiv w23, w9, w11 -; NONEON-NOSVE-NEXT: msub w10, w24, w13, w10 -; NONEON-NOSVE-NEXT: ldr w13, [sp, #24] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: ldr w24, [sp, #100] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w13, w13, w4, w1 -; NONEON-NOSVE-NEXT: ldr w1, [sp, #116] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: ldr w4, [sp, #108] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: fmov s5, w10 -; NONEON-NOSVE-NEXT: msub w1, w1, w24, w4 -; NONEON-NOSVE-NEXT: mov v5.b[1], w13 -; NONEON-NOSVE-NEXT: mov v4.b[1], w1 -; NONEON-NOSVE-NEXT: ldr w1, [sp, #120] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w8, w23, w11, w9 -; NONEON-NOSVE-NEXT: ldr w11, [sp, #48] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: udiv w28, w18, w2 -; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #52] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #272] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.b[2], w8 -; NONEON-NOSVE-NEXT: msub w8, w26, w17, w14 -; NONEON-NOSVE-NEXT: ldr w14, [sp, 
#72] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w9, w9, w11, w10 -; NONEON-NOSVE-NEXT: ldr w17, [sp, #96] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: umov w10, v3.b[11] -; NONEON-NOSVE-NEXT: umov w11, v2.b[11] -; NONEON-NOSVE-NEXT: mov v4.b[2], w9 -; NONEON-NOSVE-NEXT: mov v5.b[3], w8 -; NONEON-NOSVE-NEXT: msub w8, w25, w15, w12 -; NONEON-NOSVE-NEXT: ldp w13, w9, [sp, #76] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: udiv w27, w16, w0 -; NONEON-NOSVE-NEXT: ldr w15, [sp, #104] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #256] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w9, w9, w14, w13 -; NONEON-NOSVE-NEXT: ldr w14, [sp, #60] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.b[4], w8 -; NONEON-NOSVE-NEXT: msub w8, w28, w2, w18 -; NONEON-NOSVE-NEXT: ldr w2, [sp, #156] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.b[3], w9 -; NONEON-NOSVE-NEXT: ldp w12, w9, [sp, #64] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.b[5], w8 -; NONEON-NOSVE-NEXT: msub w8, w27, w0, w16 -; NONEON-NOSVE-NEXT: ldr w0, [sp, #132] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: udiv w4, w19, w21 -; NONEON-NOSVE-NEXT: msub w9, w9, w14, w12 -; NONEON-NOSVE-NEXT: umov w12, v3.b[12] -; NONEON-NOSVE-NEXT: umov w14, v2.b[12] -; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #240] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.b[6], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.b[4], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #112] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w8, w8, w7, w5 -; NONEON-NOSVE-NEXT: ldr w5, [sp, #204] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w9, w9, w17, w15 -; NONEON-NOSVE-NEXT: ldr w17, [sp, #84] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.b[7], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: udiv w13, w11, w10 -; NONEON-NOSVE-NEXT: mov v4.b[5], w9 -; NONEON-NOSVE-NEXT: ldp w16, w9, [sp, #88] // 8-byte Folded 
Reload -; NONEON-NOSVE-NEXT: msub w8, w8, w6, w3 -; NONEON-NOSVE-NEXT: ldr w3, [sp, #148] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w9, w9, w17, w16 -; NONEON-NOSVE-NEXT: umov w16, v3.b[13] -; NONEON-NOSVE-NEXT: umov w17, v2.b[13] -; NONEON-NOSVE-NEXT: mov v5.b[8], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.b[6], w9 -; NONEON-NOSVE-NEXT: msub w8, w8, w22, w20 -; NONEON-NOSVE-NEXT: udiv w15, w14, w12 -; NONEON-NOSVE-NEXT: ldp w18, w9, [sp, #136] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.b[9], w8 -; NONEON-NOSVE-NEXT: msub w8, w4, w21, w19 -; NONEON-NOSVE-NEXT: msub w9, w9, w0, w18 -; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #304] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #288] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.b[7], w9 -; NONEON-NOSVE-NEXT: mov v5.b[10], w8 -; NONEON-NOSVE-NEXT: msub w8, w13, w10, w11 -; NONEON-NOSVE-NEXT: ldp w0, w9, [sp, #124] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp w11, w10, [sp, #196] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: ldr w13, [sp, #192] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: udiv w18, w17, w16 -; NONEON-NOSVE-NEXT: msub w9, w9, w1, w0 -; NONEON-NOSVE-NEXT: mov v5.b[11], w8 -; NONEON-NOSVE-NEXT: umov w0, v3.b[14] -; NONEON-NOSVE-NEXT: msub w10, w10, w13, w11 -; NONEON-NOSVE-NEXT: umov w1, v2.b[14] -; NONEON-NOSVE-NEXT: msub w8, w15, w12, w14 -; NONEON-NOSVE-NEXT: mov v4.b[8], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #164] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp w15, w13, [sp, #168] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w9, w9, w3, w2 -; NONEON-NOSVE-NEXT: mov v5.b[12], w8 -; NONEON-NOSVE-NEXT: ldp w4, w3, [sp, #208] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp w14, w12, [sp, #176] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.b[9], w9 -; NONEON-NOSVE-NEXT: udiv w2, w1, w0 -; NONEON-NOSVE-NEXT: umov w9, v3.b[15] -; NONEON-NOSVE-NEXT: msub w3, w3, w5, w4 -; NONEON-NOSVE-NEXT: 
umov w4, v2.b[15] -; NONEON-NOSVE-NEXT: msub w8, w18, w16, w17 -; NONEON-NOSVE-NEXT: ldr w16, [sp, #144] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.b[10], w3 -; NONEON-NOSVE-NEXT: mov v5.b[13], w8 -; NONEON-NOSVE-NEXT: mov v4.b[11], w10 -; NONEON-NOSVE-NEXT: ldr w10, [sp, #188] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: udiv w11, w4, w9 -; NONEON-NOSVE-NEXT: msub w8, w2, w0, w1 -; NONEON-NOSVE-NEXT: msub w10, w10, w13, w12 -; NONEON-NOSVE-NEXT: umov w12, v1.b[15] -; NONEON-NOSVE-NEXT: umov w13, v0.b[15] -; NONEON-NOSVE-NEXT: mov v5.b[14], w8 -; NONEON-NOSVE-NEXT: mov v4.b[12], w10 -; NONEON-NOSVE-NEXT: ldr w10, [sp, #184] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w10, w10, w15, w14 -; NONEON-NOSVE-NEXT: ldr w15, [sp, #152] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: udiv w14, w13, w12 -; NONEON-NOSVE-NEXT: msub w8, w11, w9, w4 -; NONEON-NOSVE-NEXT: mov v4.b[13], w10 -; NONEON-NOSVE-NEXT: ldr w10, [sp, #160] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.b[15], w8 -; NONEON-NOSVE-NEXT: ldr x8, [sp, #216] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w10, w10, w16, w15 -; NONEON-NOSVE-NEXT: mov v4.b[14], w10 -; NONEON-NOSVE-NEXT: msub w9, w14, w12, w13 -; NONEON-NOSVE-NEXT: mov v4.b[15], w9 -; NONEON-NOSVE-NEXT: stp q5, q4, [x8] -; NONEON-NOSVE-NEXT: add sp, sp, #320 +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: 
strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; 
NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: udiv w10, 
w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, 
#16] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -1876,29 +1604,31 @@ define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: urem_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: umov w11, v1.h[0] -; NONEON-NOSVE-NEXT: umov w12, v0.h[0] -; NONEON-NOSVE-NEXT: umov w8, v1.h[1] -; NONEON-NOSVE-NEXT: umov w9, v0.h[1] -; NONEON-NOSVE-NEXT: umov w14, v1.h[2] -; NONEON-NOSVE-NEXT: umov w15, v0.h[2] -; NONEON-NOSVE-NEXT: umov w17, v1.h[3] -; NONEON-NOSVE-NEXT: umov w18, v0.h[3] -; NONEON-NOSVE-NEXT: udiv w13, w12, w11 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] ; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 -; NONEON-NOSVE-NEXT: fmov s0, w11 -; NONEON-NOSVE-NEXT: udiv w16, w15, w14 ; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: mov v0.h[1], w8 -; NONEON-NOSVE-NEXT: udiv w9, w18, w17 -; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 -; NONEON-NOSVE-NEXT: mov v0.h[2], w8 -; NONEON-NOSVE-NEXT: msub w8, w9, w17, w18 -; NONEON-NOSVE-NEXT: mov v0.h[3], w8 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] 
+; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = urem <4 x i16> %op1, %op2 ret <4 x i16> %res @@ -1931,47 +1661,50 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: urem_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umov w11, v1.h[0] -; NONEON-NOSVE-NEXT: umov w12, v0.h[0] -; NONEON-NOSVE-NEXT: umov w8, v1.h[1] -; NONEON-NOSVE-NEXT: umov w9, v0.h[1] -; NONEON-NOSVE-NEXT: umov w14, v1.h[2] -; NONEON-NOSVE-NEXT: umov w15, v0.h[2] -; NONEON-NOSVE-NEXT: umov w17, v1.h[3] -; NONEON-NOSVE-NEXT: umov w18, v0.h[3] -; NONEON-NOSVE-NEXT: umov w1, v1.h[4] -; NONEON-NOSVE-NEXT: umov w2, v0.h[4] -; NONEON-NOSVE-NEXT: umov w4, v1.h[5] -; NONEON-NOSVE-NEXT: umov w5, v0.h[5] -; NONEON-NOSVE-NEXT: udiv w13, w12, w11 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] ; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 -; NONEON-NOSVE-NEXT: umov w13, v1.h[7] -; NONEON-NOSVE-NEXT: fmov s2, w11 -; NONEON-NOSVE-NEXT: umov w11, v0.h[6] -; NONEON-NOSVE-NEXT: udiv w16, w15, w14 ; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: umov w10, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[1], w8 -; NONEON-NOSVE-NEXT: udiv w0, w18, w17 -; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 -; NONEON-NOSVE-NEXT: umov w14, v0.h[7] -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: udiv w3, w2, w1 -; NONEON-NOSVE-NEXT: msub w8, w0, w17, w18 -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: udiv w9, w5, w4 -; NONEON-NOSVE-NEXT: msub w8, w3, w1, w2 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: udiv w12, w11, w10 -; NONEON-NOSVE-NEXT: msub w8, w9, w4, w5 -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: udiv w9, w14, w13 -; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 -; NONEON-NOSVE-NEXT: msub w8, w9, w13, w14 -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; 
NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = urem <8 x i16> %op1, %op2 ret <8 x i16> %res @@ -2020,135 +1753,95 @@ define void @urem_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: urem_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub sp, sp, #144 -; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #48] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #64] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #80] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #96] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #112] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #128] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 144 -; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 -; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 -; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 -; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 -; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 -; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 -; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 -; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 -; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 -; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 -; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 -; 
NONEON-NOSVE-NEXT: .cfi_offset w29, -96 -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x0] -; NONEON-NOSVE-NEXT: ldr q3, [x1] -; NONEON-NOSVE-NEXT: umov w8, v1.h[1] -; NONEON-NOSVE-NEXT: umov w9, v0.h[1] -; NONEON-NOSVE-NEXT: umov w20, v1.h[0] -; NONEON-NOSVE-NEXT: umov w21, v0.h[0] -; NONEON-NOSVE-NEXT: umov w19, v0.h[3] -; NONEON-NOSVE-NEXT: umov w5, v1.h[4] -; NONEON-NOSVE-NEXT: umov w2, v0.h[4] -; NONEON-NOSVE-NEXT: umov w1, v3.h[1] -; NONEON-NOSVE-NEXT: umov w23, v2.h[1] -; NONEON-NOSVE-NEXT: umov w25, v3.h[0] -; NONEON-NOSVE-NEXT: umov w26, v2.h[0] -; NONEON-NOSVE-NEXT: umov w6, v1.h[5] -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #36] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: umov w8, v1.h[2] -; NONEON-NOSVE-NEXT: umov w9, v0.h[2] -; NONEON-NOSVE-NEXT: umov w3, v0.h[5] -; NONEON-NOSVE-NEXT: umov w4, v1.h[6] -; NONEON-NOSVE-NEXT: umov w7, v0.h[6] -; NONEON-NOSVE-NEXT: umov w28, v3.h[2] -; NONEON-NOSVE-NEXT: umov w29, v2.h[2] -; NONEON-NOSVE-NEXT: umov w15, v3.h[3] -; NONEON-NOSVE-NEXT: umov w13, v2.h[3] -; NONEON-NOSVE-NEXT: umov w12, v3.h[4] -; NONEON-NOSVE-NEXT: umov w14, v3.h[5] -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w11, w21, w20 -; NONEON-NOSVE-NEXT: str w10, [sp, #44] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: umov w8, v1.h[3] -; NONEON-NOSVE-NEXT: stp w8, w11, [sp] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: umov w11, v2.h[4] -; NONEON-NOSVE-NEXT: ldr w22, [sp, #4] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w20, w22, w20, w21 -; NONEON-NOSVE-NEXT: udiv w9, w19, w8 -; NONEON-NOSVE-NEXT: str w10, [sp, #32] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: umov w10, v3.h[6] -; NONEON-NOSVE-NEXT: fmov s5, w20 -; NONEON-NOSVE-NEXT: umov w20, v3.h[7] -; NONEON-NOSVE-NEXT: udiv w8, w2, w5 -; NONEON-NOSVE-NEXT: udiv w24, w23, w1 -; NONEON-NOSVE-NEXT: stp w8, w9, 
[sp, #16] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w27, w26, w25 -; NONEON-NOSVE-NEXT: msub w1, w24, w1, w23 -; NONEON-NOSVE-NEXT: ldp w24, w23, [sp, #40] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: udiv w9, w3, w6 -; NONEON-NOSVE-NEXT: msub w21, w27, w25, w26 -; NONEON-NOSVE-NEXT: ldr w25, [sp, #36] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w23, w23, w25, w24 -; NONEON-NOSVE-NEXT: ldr w25, [sp, #24] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: fmov s4, w21 -; NONEON-NOSVE-NEXT: mov v5.h[1], w23 -; NONEON-NOSVE-NEXT: ldp w23, w21, [sp, #28] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.h[1], w1 -; NONEON-NOSVE-NEXT: udiv w8, w7, w4 -; NONEON-NOSVE-NEXT: msub w21, w21, w25, w23 -; NONEON-NOSVE-NEXT: umov w23, v2.h[7] -; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #80] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.h[2], w21 -; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #112] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: udiv w30, w29, w28 -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: umov w9, v2.h[5] -; NONEON-NOSVE-NEXT: umov w8, v2.h[6] -; NONEON-NOSVE-NEXT: udiv w18, w13, w15 -; NONEON-NOSVE-NEXT: msub w1, w30, w28, w29 -; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #64] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #48] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.h[2], w1 -; NONEON-NOSVE-NEXT: udiv w16, w11, w12 -; NONEON-NOSVE-NEXT: msub w13, w18, w15, w13 -; NONEON-NOSVE-NEXT: ldr w15, [sp, #20] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: ldr w18, [sp] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w15, w15, w18, w19 -; NONEON-NOSVE-NEXT: mov v4.h[3], w13 -; NONEON-NOSVE-NEXT: umov w13, v1.h[7] -; NONEON-NOSVE-NEXT: mov v5.h[3], w15 -; NONEON-NOSVE-NEXT: umov w15, v0.h[7] -; NONEON-NOSVE-NEXT: udiv w17, w9, w14 -; NONEON-NOSVE-NEXT: msub w11, w16, w12, w11 -; NONEON-NOSVE-NEXT: ldr w12, [sp, #16] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w12, w12, w5, w2 
-; NONEON-NOSVE-NEXT: mov v4.h[4], w11 -; NONEON-NOSVE-NEXT: ldr w11, [sp, #12] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.h[4], w12 -; NONEON-NOSVE-NEXT: msub w11, w11, w6, w3 -; NONEON-NOSVE-NEXT: udiv w24, w8, w10 -; NONEON-NOSVE-NEXT: msub w9, w17, w14, w9 -; NONEON-NOSVE-NEXT: mov v5.h[5], w11 -; NONEON-NOSVE-NEXT: mov v4.h[5], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w9, w9, w4, w7 -; NONEON-NOSVE-NEXT: udiv w18, w23, w20 -; NONEON-NOSVE-NEXT: msub w8, w24, w10, w8 -; NONEON-NOSVE-NEXT: mov v5.h[6], w9 -; NONEON-NOSVE-NEXT: mov v4.h[6], w8 -; NONEON-NOSVE-NEXT: udiv w12, w15, w13 -; NONEON-NOSVE-NEXT: msub w8, w18, w20, w23 -; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #128] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #96] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.h[7], w8 -; NONEON-NOSVE-NEXT: msub w9, w12, w13, w15 -; NONEON-NOSVE-NEXT: mov v5.h[7], w9 -; NONEON-NOSVE-NEXT: stp q4, q5, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #144 +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; 
NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, 
#70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -2171,19 +1864,20 @@ define <2 x i32> @urem_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: urem_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: fmov w8, s1 -; NONEON-NOSVE-NEXT: fmov w9, s0 -; NONEON-NOSVE-NEXT: mov w11, v1.s[1] -; NONEON-NOSVE-NEXT: mov w12, v0.s[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w11, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w10, w11, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w11 +; NONEON-NOSVE-NEXT: str w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] ; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: udiv w13, w12, w11 ; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: fmov s0, w8 -; NONEON-NOSVE-NEXT: msub w9, w13, w11, w12 -; NONEON-NOSVE-NEXT: mov v0.s[1], w9 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; 
NONEON-NOSVE-NEXT: ret %res = urem <2 x i32> %op1, %op2 ret <2 x i32> %res @@ -2203,26 +1897,28 @@ define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: urem_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov w11, s1 -; NONEON-NOSVE-NEXT: fmov w12, s0 -; NONEON-NOSVE-NEXT: mov w8, v1.s[1] -; NONEON-NOSVE-NEXT: mov w9, v0.s[1] -; NONEON-NOSVE-NEXT: mov w14, v1.s[2] -; NONEON-NOSVE-NEXT: mov w15, v0.s[2] -; NONEON-NOSVE-NEXT: mov w17, v1.s[3] -; NONEON-NOSVE-NEXT: mov w18, v0.s[3] -; NONEON-NOSVE-NEXT: udiv w13, w12, w11 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w11, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: udiv w10, w11, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w11 +; NONEON-NOSVE-NEXT: str w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #4] ; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 -; NONEON-NOSVE-NEXT: fmov s0, w11 -; NONEON-NOSVE-NEXT: udiv w16, w15, w14 ; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: mov v0.s[1], w8 -; NONEON-NOSVE-NEXT: udiv w9, w18, w17 -; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 -; NONEON-NOSVE-NEXT: mov v0.s[2], w8 -; NONEON-NOSVE-NEXT: msub w8, w9, w17, w18 -; NONEON-NOSVE-NEXT: mov v0.s[3], w8 +; NONEON-NOSVE-NEXT: ldr w9, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #36] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: str w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = urem <4 x i32> %op1, %op2 ret <4 x i32> %res @@ -2246,61 +1942,50 @@ define void @urem_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: 
urem_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str x23, [sp, #-48]! // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 -; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 -; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 -; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 -; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 -; NONEON-NOSVE-NEXT: .cfi_offset w23, -48 -; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] -; NONEON-NOSVE-NEXT: fmov w12, s0 -; NONEON-NOSVE-NEXT: fmov w3, s2 -; NONEON-NOSVE-NEXT: mov w9, v0.s[1] -; NONEON-NOSVE-NEXT: fmov w11, s1 -; NONEON-NOSVE-NEXT: fmov w2, s3 -; NONEON-NOSVE-NEXT: mov w8, v1.s[1] -; NONEON-NOSVE-NEXT: mov w17, v3.s[1] -; NONEON-NOSVE-NEXT: mov w18, v2.s[1] -; NONEON-NOSVE-NEXT: mov w14, v1.s[2] -; NONEON-NOSVE-NEXT: mov w15, v0.s[2] -; NONEON-NOSVE-NEXT: mov w5, v3.s[2] -; NONEON-NOSVE-NEXT: mov w6, v2.s[2] -; NONEON-NOSVE-NEXT: udiv w13, w12, w11 -; NONEON-NOSVE-NEXT: mov w19, v3.s[3] -; NONEON-NOSVE-NEXT: mov w20, v2.s[3] -; NONEON-NOSVE-NEXT: mov w22, v1.s[3] -; NONEON-NOSVE-NEXT: mov w23, v0.s[3] -; NONEON-NOSVE-NEXT: udiv w4, w3, w2 -; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 -; NONEON-NOSVE-NEXT: fmov s1, w11 -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: msub w12, w4, w2, w3 -; NONEON-NOSVE-NEXT: fmov s0, w12 -; NONEON-NOSVE-NEXT: udiv w1, w18, w17 -; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: mov v1.s[1], w8 -; NONEON-NOSVE-NEXT: udiv w16, w15, w14 -; NONEON-NOSVE-NEXT: msub w13, w1, w17, w18 -; NONEON-NOSVE-NEXT: mov v0.s[1], w13 -; NONEON-NOSVE-NEXT: udiv w7, w6, w5 -; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 -; NONEON-NOSVE-NEXT: mov v1.s[2], w8 -; NONEON-NOSVE-NEXT: udiv w21, w20, w19 -; NONEON-NOSVE-NEXT: msub w10, w7, w5, w6 -; NONEON-NOSVE-NEXT: mov v0.s[2], w10 -; NONEON-NOSVE-NEXT: udiv w9, w23, w22 -; 
NONEON-NOSVE-NEXT: msub w10, w21, w19, w20 -; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v0.s[3], w10 -; NONEON-NOSVE-NEXT: msub w8, w9, w22, w23 -; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v1.s[3], w8 +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: udiv w10, w11, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w11 +; NONEON-NOSVE-NEXT: str w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #36] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #84] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #12] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #76] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldp w9, w11, [sp, #4] +; NONEON-NOSVE-NEXT: udiv w10, w11, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w11 +; NONEON-NOSVE-NEXT: str w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w9, [sp] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #64] +; 
NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: ldr x23, [sp], #48 // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -2323,13 +2008,15 @@ define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: urem_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fmov x8, d1 ; NONEON-NOSVE-NEXT: fmov x9, d0 ; NONEON-NOSVE-NEXT: udiv x10, x9, x8 ; NONEON-NOSVE-NEXT: msub x8, x10, x8, x9 -; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = urem <1 x i64> %op1, %op2 ret <1 x i64> %res @@ -2349,16 +2036,19 @@ define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: urem_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d1 -; NONEON-NOSVE-NEXT: fmov x9, d0 -; NONEON-NOSVE-NEXT: mov x11, v1.d[1] -; NONEON-NOSVE-NEXT: mov x12, v0.d[1] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x11, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: udiv x10, x11, x8 +; NONEON-NOSVE-NEXT: msub x8, x10, x8, x11 +; NONEON-NOSVE-NEXT: str x8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] ; NONEON-NOSVE-NEXT: udiv x10, x9, x8 -; NONEON-NOSVE-NEXT: udiv x13, x12, x11 ; NONEON-NOSVE-NEXT: msub x8, x10, x8, x9 -; NONEON-NOSVE-NEXT: fmov d0, x8 -; NONEON-NOSVE-NEXT: msub x9, x13, x11, x12 -; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = urem <2 x i64> %op1, %op2 ret <2 x i64> %res @@ -2382,29 +2072,33 @@ define void @urem_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: urem_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] -; NONEON-NOSVE-NEXT: fmov x9, d0 -; NONEON-NOSVE-NEXT: fmov x15, d2 -; NONEON-NOSVE-NEXT: mov x12, v2.d[1] -; NONEON-NOSVE-NEXT: fmov x8, d1 -; NONEON-NOSVE-NEXT: fmov x14, d3 -; NONEON-NOSVE-NEXT: mov x11, v3.d[1] -; NONEON-NOSVE-NEXT: mov x17, v1.d[1] -; NONEON-NOSVE-NEXT: mov x18, v0.d[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: udiv x10, x11, x8 +; NONEON-NOSVE-NEXT: msub x8, x10, x8, x11 +; NONEON-NOSVE-NEXT: str x8, [sp, #88] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: udiv x10, x9, x8 +; NONEON-NOSVE-NEXT: msub x11, x10, x8, x9 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #8] +; NONEON-NOSVE-NEXT: udiv x10, x9, x8 +; NONEON-NOSVE-NEXT: msub x8, x10, x8, x9 +; NONEON-NOSVE-NEXT: ldr x9, [sp] +; 
NONEON-NOSVE-NEXT: stp x8, x11, [sp, #72] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] ; NONEON-NOSVE-NEXT: udiv x10, x9, x8 -; NONEON-NOSVE-NEXT: udiv x16, x15, x14 ; NONEON-NOSVE-NEXT: msub x8, x10, x8, x9 -; NONEON-NOSVE-NEXT: fmov d1, x8 -; NONEON-NOSVE-NEXT: udiv x13, x12, x11 -; NONEON-NOSVE-NEXT: msub x10, x16, x14, x15 -; NONEON-NOSVE-NEXT: fmov d0, x10 -; NONEON-NOSVE-NEXT: udiv x1, x18, x17 -; NONEON-NOSVE-NEXT: msub x9, x13, x11, x12 -; NONEON-NOSVE-NEXT: mov v0.d[1], x9 -; NONEON-NOSVE-NEXT: msub x11, x1, x17, x18 -; NONEON-NOSVE-NEXT: mov v1.d[1], x11 +; NONEON-NOSVE-NEXT: str x8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll index 0108fb580b947b..5cee1360f6f3cf 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll @@ -20,10 +20,28 @@ define <4 x i8> @select_v4i8(<4 x i8> %op1, <4 x i8> %op2, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v4i8: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: dup v2.4h, w8 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh 
w8, [sp, #18] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <4 x i8> %op1, <4 x i8> %op2 ret <4 x i8> %sel @@ -43,10 +61,44 @@ define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v8i8: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: dup v2.8b, w8 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; 
NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <8 x i8> %op1, <8 x i8> %op2 ret <8 x i8> %sel @@ -66,10 +118,75 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v16i8: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: dup v2.16b, w8 -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: 
ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <16 x i8> %op1, <16 x i8> %op2 ret <16 x i8> %sel @@ -92,16 +209,147 @@ define void @select_v32i8(ptr %a, ptr %b, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v32i8: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] ; NONEON-NOSVE-NEXT: tst w2, #0x1 -; NONEON-NOSVE-NEXT: ldr q1, [x0] -; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] -; 
NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: ldr q3, [x1] -; NONEON-NOSVE-NEXT: ldr q4, [x1, #16] -; NONEON-NOSVE-NEXT: dup v0.16b, w8 -; NONEON-NOSVE-NEXT: bif v1.16b, v3.16b, v0.16b -; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v4.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x1] +; NONEON-NOSVE-NEXT: ldr q3, [x1, #16] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: stp q1, q3, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: str q2, [sp, #48] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, 
#71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #63] +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #62] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #61] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #60] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #59] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #58] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; 
NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #57] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #56] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #55] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #54] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #53] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #52] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #51] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #50] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #49] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load volatile <32 x i8>, ptr %a %op2 = load volatile <32 x i8>, ptr %b @@ -125,10 +373,18 
@@ define <2 x i16> @select_v2i16(<2 x i16> %op1, <2 x i16> %op2, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v2i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: dup v2.2s, w8 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: csel w11, w10, w8, ne +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <2 x i16> %op1, <2 x i16> %op2 ret <2 x i16> %sel @@ -149,10 +405,28 @@ define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v4i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: dup v2.4h, w8 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; 
NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <4 x i16> %op1, <4 x i16> %op2 ret <4 x i16> %sel @@ -173,10 +447,43 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v8i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: dup v2.8h, w8 -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <8 x i16> %op1, <8 x 
i16> %op2 ret <8 x i16> %sel @@ -200,16 +507,83 @@ define void @select_v16i16(ptr %a, ptr %b, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v16i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] ; NONEON-NOSVE-NEXT: tst w2, #0x1 -; NONEON-NOSVE-NEXT: ldr q1, [x0] -; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: ldr q3, [x1] -; NONEON-NOSVE-NEXT: ldr q4, [x1, #16] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 -; NONEON-NOSVE-NEXT: bif v1.16b, v3.16b, v0.16b -; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v4.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x1] +; NONEON-NOSVE-NEXT: ldr q3, [x1, #16] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: stp q1, q3, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: str q2, [sp, #48] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; 
NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #62] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #60] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #58] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #56] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #54] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #52] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #50] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load volatile <16 x i16>, ptr %a %op2 = load volatile <16 x i16>, ptr %b @@ -233,10 +607,18 @@ define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v2i32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; 
NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: dup v2.2s, w8 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: csel w11, w10, w8, ne +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <2 x i32> %op1, <2 x i32> %op2 ret <2 x i32> %sel @@ -257,10 +639,23 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v4i32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: dup v2.4s, w8 -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: csel w11, w10, w8, ne +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: csel w11, w10, w8, ne +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <4 x i32> %op1, <4 x i32> %op2 ret <4 x i32> %sel @@ -284,16 +679,43 @@ define void @select_v8i32(ptr %a, ptr %b, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v8i32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; 
NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] ; NONEON-NOSVE-NEXT: tst w2, #0x1 -; NONEON-NOSVE-NEXT: ldr q1, [x0] -; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: ldr q3, [x1] -; NONEON-NOSVE-NEXT: ldr q4, [x1, #16] -; NONEON-NOSVE-NEXT: dup v0.4s, w8 -; NONEON-NOSVE-NEXT: bif v1.16b, v3.16b, v0.16b -; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v4.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x1] +; NONEON-NOSVE-NEXT: ldr q3, [x1, #16] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: stp q1, q3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: str q2, [sp, #48] +; NONEON-NOSVE-NEXT: csel w11, w8, w10, ne +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: csel w11, w8, w10, ne +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #56] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #64] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: csel w11, w8, w10, ne +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #48] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: csel w11, w8, w10, ne +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load volatile <8 x i32>, ptr %a %op2 = load volatile <8 x i32>, ptr %b @@ -318,10 +740,15 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) { ; ; 
NONEON-NOSVE-LABEL: select_v1i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm x8, ne -; NONEON-NOSVE-NEXT: fmov d2, x8 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: csel x8, x9, x8, ne +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <1 x i64> %op1, <1 x i64> %op2 ret <1 x i64> %sel @@ -343,10 +770,17 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v2i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm x8, ne -; NONEON-NOSVE-NEXT: dup v2.2d, x8 -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: csel x11, x10, x8, ne +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: csel x8, x9, x8, ne +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <2 x i64> %op1, <2 x i64> %op2 ret <2 x i64> %sel @@ -371,16 +805,31 @@ define void @select_v4i64(ptr %a, ptr %b, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v4i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] ; NONEON-NOSVE-NEXT: tst w2, #0x1 -; NONEON-NOSVE-NEXT: ldr q1, [x0] -; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] -; NONEON-NOSVE-NEXT: csetm x8, ne -; NONEON-NOSVE-NEXT: ldr q3, [x1] -; NONEON-NOSVE-NEXT: ldr q4, [x1, #16] -; NONEON-NOSVE-NEXT: dup 
v0.2d, x8 -; NONEON-NOSVE-NEXT: bif v1.16b, v3.16b, v0.16b -; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v4.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x1] +; NONEON-NOSVE-NEXT: ldr q3, [x1, #16] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: stp q1, q3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: str q2, [sp, #48] +; NONEON-NOSVE-NEXT: csel x11, x8, x10, ne +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: csel x8, x8, x9, ne +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #48] +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #64] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: csel x11, x8, x10, ne +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: csel x8, x8, x9, ne +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load volatile <4 x i64>, ptr %a %op2 = load volatile <4 x i64>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll index f7198e3042ad53..2778e93416a748 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll @@ -23,12 +23,27 @@ define <4 x i8> @ashr_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: ashr_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d2, #0xff00ff00ff00ff -; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #8 -; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #8 -; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: neg v1.4h, v1.4h -; NONEON-NOSVE-NEXT: sshl v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, 
#22] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsb w13, [sp, #10] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsb w14, [sp, #8] +; NONEON-NOSVE-NEXT: asr w10, w11, w10 +; NONEON-NOSVE-NEXT: asr w11, w13, w12 +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: asr w8, w14, w9 +; NONEON-NOSVE-NEXT: strh w10, [sp, #28] +; NONEON-NOSVE-NEXT: strh w11, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = ashr <4 x i8> %op1, %op2 ret <4 x i8> %res @@ -46,8 +61,43 @@ define <8 x i8> @ashr_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: ashr_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg v1.8b, v1.8b -; NONEON-NOSVE-NEXT: sshl v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb 
w8, [sp, #18] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = ashr <8 x i8> %op1, %op2 ret <8 x i8> %res @@ -65,8 +115,74 @@ define <16 x i8> @ashr_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: ashr_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg v1.16b, v1.16b -; NONEON-NOSVE-NEXT: sshl v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: 
ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = ashr <16 x i8> %op1, %op2 ret <16 x i8> %res @@ -86,13 +202,143 @@ define void @ashr_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: ashr_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: neg v0.16b, v0.16b -; NONEON-NOSVE-NEXT: neg v1.16b, v1.16b -; NONEON-NOSVE-NEXT: sshl v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: sshl v1.16b, v3.16b, v1.16b +; 
NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #47] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb 
w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; 
NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -115,12 +361,18 @@ define <2 x i16> @ashr_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: ashr_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d2, #0x00ffff0000ffff -; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16 -; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16 -; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: neg v1.2s, v1.2s -; NONEON-NOSVE-NEXT: sshl 
v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #8] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: asr w9, w11, w10 +; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = ashr <2 x i16> %op1, %op2 ret <2 x i16> %res @@ -138,8 +390,27 @@ define <4 x i16> @ashr_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: ashr_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg v1.4h, v1.4h -; NONEON-NOSVE-NEXT: sshl v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = ashr <4 x i16> %op1, %op2 ret <4 x i16> %res @@ -157,8 +428,42 @@ define <8 x i16> @ashr_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: ashr_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg v1.8h, v1.8h -; NONEON-NOSVE-NEXT: sshl v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: 
stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = ashr <8 x i16> %op1, %op2 ret <8 x i16> %res @@ -178,13 +483,79 @@ define void @ashr_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: ashr_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: neg v0.8h, v0.8h -; NONEON-NOSVE-NEXT: neg v1.8h, v1.8h -; NONEON-NOSVE-NEXT: sshl v0.8h, v2.8h, v0.8h -; NONEON-NOSVE-NEXT: sshl v1.8h, v3.8h, v1.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, 
[x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #46] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; 
NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -205,8 +576,17 @@ define <2 x i32> @ashr_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: ashr_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg v1.2s, v1.2s -; NONEON-NOSVE-NEXT: sshl v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: asr w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = ashr <2 x i32> %op1, %op2 ret <2 x i32> %res @@ -224,8 +604,22 @@ define <4 x i32> @ashr_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: ashr_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg v1.4s, v1.4s -; NONEON-NOSVE-NEXT: sshl v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: asr w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: asr w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = ashr <4 x i32> %op1, %op2 ret <4 x i32> %res @@ -245,13 +639,39 @@ define void @ashr_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: ashr_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: neg v0.4s, v0.4s -; NONEON-NOSVE-NEXT: neg v1.4s, v1.4s -; NONEON-NOSVE-NEXT: sshl v0.4s, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: sshl v1.4s, v3.4s, v1.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: asr w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: asr w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: asr w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 
+; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: asr w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -272,8 +692,14 @@ define <1 x i64> @ashr_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: ashr_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg d1, d1 -; NONEON-NOSVE-NEXT: sshl d0, d0, d1 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: asr x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = ashr <1 x i64> %op1, %op2 ret <1 x i64> %res @@ -291,8 +717,16 @@ define <2 x i64> @ashr_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: ashr_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg v1.2d, v1.2d -; NONEON-NOSVE-NEXT: sshl v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: asr x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: asr x8, x9, x8 +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = ashr <2 x i64> %op1, %op2 ret <2 x i64> %res @@ -312,13 +746,27 @@ define void @ashr_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: ashr_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: neg v0.2d, v0.2d -; NONEON-NOSVE-NEXT: neg v1.2d, v1.2d -; NONEON-NOSVE-NEXT: sshl v0.2d, v2.2d, v0.2d -; NONEON-NOSVE-NEXT: sshl v1.2d, v3.2d, v1.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: asr x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: asr x8, x9, x8 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: asr x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: asr x8, x9, x8 +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -345,11 +793,27 @@ define <4 x i8> @lshr_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: lshr_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d2, #0xff00ff00ff00ff -; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b -; 
NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v2.8b -; NONEON-NOSVE-NEXT: neg v1.4h, v1.4h -; NONEON-NOSVE-NEXT: ushl v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #10] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #8] +; NONEON-NOSVE-NEXT: lsr w10, w11, w10 +; NONEON-NOSVE-NEXT: lsr w11, w13, w12 +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: lsr w8, w14, w9 +; NONEON-NOSVE-NEXT: strh w10, [sp, #28] +; NONEON-NOSVE-NEXT: strh w11, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = lshr <4 x i8> %op1, %op2 ret <4 x i8> %res @@ -367,8 +831,43 @@ define <8 x i8> @lshr_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: lshr_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg v1.8b, v1.8b -; NONEON-NOSVE-NEXT: ushl v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: 
lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = lshr <8 x i8> %op1, %op2 ret <8 x i8> %res @@ -386,8 +885,74 @@ define <16 x i8> @lshr_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: lshr_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg v1.16b, v1.16b -; NONEON-NOSVE-NEXT: ushl v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] 
+; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = lshr <16 x i8> %op1, %op2 ret <16 x i8> %res @@ -407,13 +972,143 @@ define void @lshr_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: lshr_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: neg v0.16b, v0.16b -; NONEON-NOSVE-NEXT: neg v1.16b, v1.16b -; NONEON-NOSVE-NEXT: ushl v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: ushl v1.16b, v3.16b, v1.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: 
ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; 
NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; 
NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -436,11 +1131,18 @@ define <2 x i16> @lshr_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: lshr_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d2, #0x00ffff0000ffff -; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v2.8b -; NONEON-NOSVE-NEXT: neg v1.2s, v1.2s -; NONEON-NOSVE-NEXT: ushl v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #8] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: lsr w9, w11, w10 +; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = lshr <2 x i16> %op1, %op2 ret <2 x i16> %res @@ -458,8 +1160,27 @@ define <4 x i16> @lshr_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: lshr_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg v1.4h, v1.4h -; NONEON-NOSVE-NEXT: ushl v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp 
d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = lshr <4 x i16> %op1, %op2 ret <4 x i16> %res @@ -477,8 +1198,42 @@ define <8 x i16> @lshr_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: lshr_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg v1.8h, v1.8h -; NONEON-NOSVE-NEXT: ushl v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = lshr <8 x i16> %op1, %op2 ret <8 x i16> %res @@ -498,13 +1253,79 @@ define void @lshr_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: lshr_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: neg v0.8h, v0.8h -; NONEON-NOSVE-NEXT: neg v1.8h, v1.8h -; NONEON-NOSVE-NEXT: ushl v0.8h, v2.8h, v0.8h -; NONEON-NOSVE-NEXT: ushl v1.8h, v3.8h, v1.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp 
q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: 
ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -525,8 +1346,17 @@ define <2 x i32> @lshr_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: lshr_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg v1.2s, v1.2s -; NONEON-NOSVE-NEXT: ushl v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsr w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = lshr <2 x i32> %op1, %op2 ret <2 x i32> %res @@ -544,8 +1374,22 @@ define <4 x i32> @lshr_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: lshr_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg v1.4s, v1.4s -; NONEON-NOSVE-NEXT: ushl v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsr w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsr w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = lshr <4 x i32> %op1, %op2 ret <4 x i32> %res @@ -565,13 +1409,39 @@ define void @lshr_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: lshr_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: neg v0.4s, v0.4s -; NONEON-NOSVE-NEXT: neg v1.4s, v1.4s -; NONEON-NOSVE-NEXT: ushl v0.4s, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: ushl v1.4s, v3.4s, v1.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: lsr w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: lsr w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsr w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 
+; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsr w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -592,8 +1462,14 @@ define <1 x i64> @lshr_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: lshr_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg d1, d1 -; NONEON-NOSVE-NEXT: ushl d0, d0, d1 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: lsr x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = lshr <1 x i64> %op1, %op2 ret <1 x i64> %res @@ -611,8 +1487,16 @@ define <2 x i64> @lshr_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: lshr_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg v1.2d, v1.2d -; NONEON-NOSVE-NEXT: ushl v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: lsr x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: lsr x8, x9, x8 +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = lshr <2 x i64> %op1, %op2 ret <2 x i64> %res @@ -632,13 +1516,27 @@ define void @lshr_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: lshr_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: neg v0.2d, v0.2d -; NONEON-NOSVE-NEXT: neg v1.2d, v1.2d -; NONEON-NOSVE-NEXT: ushl v0.2d, v2.2d, v0.2d -; NONEON-NOSVE-NEXT: ushl v1.2d, v3.2d, v1.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: lsr x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: lsr x8, x9, x8 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: lsr x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: lsr x8, x9, x8 +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -664,9 +1562,18 @@ define <2 x i8> @shl_v2i8(<2 x i8> %op1, <2 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: shl_v2i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d2, #0x0000ff000000ff -; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b -; 
NONEON-NOSVE-NEXT: ushl v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #20] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w11, w10, w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = shl <2 x i8> %op1, %op2 ret <2 x i8> %res @@ -685,9 +1592,27 @@ define <4 x i8> @shl_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: shl_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d2, #0xff00ff00ff00ff -; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: ushl v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w11, w12, w11 +; NONEON-NOSVE-NEXT: strh w11, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #12] +; NONEON-NOSVE-NEXT: lsl w10, w11, w10 +; NONEON-NOSVE-NEXT: strh w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #10] +; NONEON-NOSVE-NEXT: lsl w9, w10, w9 +; NONEON-NOSVE-NEXT: strh w9, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = shl <4 x i8> %op1, %op2 ret <4 x i8> %res @@ -705,7 +1630,43 @@ define <8 x i8> @shl_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: shl_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushl v0.8b, v0.8b, 
v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = shl <8 x i8> %op1, %op2 ret <8 x i8> %res @@ -723,7 +1684,74 @@ define <16 x i8> @shl_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: shl_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushl v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] 
+; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = shl <16 x i8> %op1, %op2 ret <16 x i8> %res @@ -743,11 +1771,143 @@ define void @shl_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: shl_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: ushl v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: ushl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: 
ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: 
ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb 
w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -768,7 +1928,27 @@ define <4 x i16> @shl_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: shl_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushl v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = shl <4 x i16> %op1, %op2 ret <4 x i16> %res @@ -786,7 +1966,42 @@ define <8 x i16> @shl_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: shl_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushl v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = shl <8 x i16> %op1, %op2 ret <8 x i16> %res @@ -806,11 +2021,79 @@ define void @shl_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: shl_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: ushl v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: ushl v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, 
q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh 
w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -831,7 +2114,17 @@ define <2 x i32> @shl_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: shl_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushl v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsl w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = shl <2 x i32> %op1, %op2 ret <2 x i32> %res @@ -849,7 +2142,22 @@ define <4 x i32> @shl_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: shl_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushl v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsl w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsl w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = shl <4 x i32> %op1, %op2 ret <4 x i32> %res @@ -869,11 +2177,39 @@ define void @shl_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: shl_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: ushl v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: ushl v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: lsl w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: lsl w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsl w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, 
#72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsl w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -894,7 +2230,14 @@ define <1 x i64> @shl_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: shl_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushl d0, d0, d1 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: lsl x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = shl <1 x i64> %op1, %op2 ret <1 x i64> %res @@ -912,7 +2255,16 @@ define <2 x i64> @shl_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: shl_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushl v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl x8, x9, x8 +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = shl <2 x i64> %op1, %op2 ret <2 x i64> %res @@ -932,11 +2284,27 @@ define void @shl_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: shl_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: ushl v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: ushl v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: lsl x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: lsl x8, x9, x8 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl x8, x9, x8 +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll index 42d3b9d8f71f86..fd2d9a8fb80d17 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll +++ 
b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll @@ -19,9 +19,26 @@ define <4 x half> @ucvtf_v4i16_v4f16(<4 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: ucvtf_v4i16_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = uitofp <4 x i16> %op1 to <4 x half> ret <4 x half> %res @@ -39,17 +56,43 @@ define void @ucvtf_v8i16_v8f16(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ushll v1.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] -; NONEON-NOSVE-NEXT: ucvtf v1.4s, v1.4s -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: fcvtn v1.4h, v1.4s -; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s -; NONEON-NOSVE-NEXT: str q1, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %res = uitofp <8 x i16> %op1 to <8 x half> @@ -69,25 +112,76 @@ define void @ucvtf_v16i16_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ushll v2.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ushll v0.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: ucvtf v2.4s, v2.4s -; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: ucvtf v1.4s, v1.4s -; NONEON-NOSVE-NEXT: ucvtf v3.4s, v3.4s -; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v1.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v3.4s -; NONEON-NOSVE-NEXT: stp q2, q0, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; 
NONEON-NOSVE-NEXT: str h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = uitofp <16 x i16> %op1 to <16 x half> @@ -111,9 +205,15 @@ define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: ucvtf_v2i16_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d1, #0x00ffff0000ffff -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: ucvtf v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: ucvtf s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = uitofp <2 x i16> %op1 to <2 x float> ret <2 x float> %res @@ -131,8 +231,21 @@ define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: ucvtf_v4i16_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: ucvtf s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: ucvtf s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = uitofp <4 x i16> %op1 to <4 x float> ret <4 x float> %res @@ -154,15 +267,33 @@ define void @ucvtf_v8i16_v8f32(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: ucvtf v1.4s, v1.4s -; NONEON-NOSVE-NEXT: stp q0, q1, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ucvtf s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ucvtf s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ucvtf s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ucvtf s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %res = uitofp <8 x i16> %op1 to <8 x float> @@ -192,21 +323,57 @@ define void @ucvtf_v16i16_v16f32(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: ucvtf v1.4s, v1.4s -; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: ucvtf v2.4s, v2.4s -; NONEON-NOSVE-NEXT: ucvtf v3.4s, v3.4s -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-128]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #46] +; NONEON-NOSVE-NEXT: ucvtf s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #44] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #42] +; NONEON-NOSVE-NEXT: ucvtf s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #40] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #80] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #38] +; NONEON-NOSVE-NEXT: ucvtf s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #36] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #34] +; NONEON-NOSVE-NEXT: ucvtf s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #32] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #64] +; NONEON-NOSVE-NEXT: ucvtf s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #120] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ucvtf s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #112] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ucvtf s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #104] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ucvtf s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; 
NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = uitofp <16 x i16> %op1 to <16 x float> @@ -229,9 +396,13 @@ define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: ucvtf_v1i16_v1f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: umov w8, v0.h[0] -; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %res = uitofp <1 x i16> %op1 to <1 x double> ret <1 x double> %res @@ -250,10 +421,16 @@ define <2 x double> @ucvtf_v2i16_v2f64(<2 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: ucvtf_v2i16_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d1, #0x00ffff0000ffff -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = uitofp <2 x i16> %op1 to <2 x double> ret <2 x double> %res @@ -275,17 +452,31 @@ define void @ucvtf_v4i16_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: ucvtf_v4i16_v4f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 ; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: stp q0, q1, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #44] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #40] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #36] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #32] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i16>, ptr %a %res = uitofp <4 x i16> %op1 to <4 x double> @@ -318,26 +509,53 @@ define void @ucvtf_v8i16_v8f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #160 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: ushll v3.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: ucvtf v2.2d, v2.2d -; NONEON-NOSVE-NEXT: ucvtf v3.2d, v3.2d -; NONEON-NOSVE-NEXT: stp q0, q2, [x1] -; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #18] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #92] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #88] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #144] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #84] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #80] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #128] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #76] +; 
NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #128] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #72] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #68] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #64] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %res = uitofp <8 x i16> %op1 to <8 x double> @@ -390,42 +608,99 @@ define void @ucvtf_v16i16_v16f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #336 +; NONEON-NOSVE-NEXT: str x29, [sp, #320] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 336 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 ; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #64] -; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ldr d4, [sp, #88] -; NONEON-NOSVE-NEXT: ldr d6, [sp, #72] -; NONEON-NOSVE-NEXT: ldr d7, [sp, #40] -; NONEON-NOSVE-NEXT: ushll v5.2d, v5.2s, #0 -; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: ushll v3.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: ushll v4.2d, v4.2s, #0 -; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: ushll v6.2d, v6.2s, #0 -; NONEON-NOSVE-NEXT: ushll v7.2d, v7.2s, #0 -; NONEON-NOSVE-NEXT: ucvtf v2.2d, v2.2d -; NONEON-NOSVE-NEXT: ucvtf v5.2d, v5.2d -; NONEON-NOSVE-NEXT: ucvtf v3.2d, v3.2d -; NONEON-NOSVE-NEXT: ucvtf v4.2d, v4.2d -; NONEON-NOSVE-NEXT: stp q0, q5, [x1] -; NONEON-NOSVE-NEXT: ucvtf v0.2d, v7.2d -; NONEON-NOSVE-NEXT: stp q1, q4, [x1, #64] -; NONEON-NOSVE-NEXT: ucvtf v1.2d, v6.2d -; NONEON-NOSVE-NEXT: stp q2, q0, [x1, #32] -; NONEON-NOSVE-NEXT: stp q3, q1, [x1, #96] -; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: ldr x29, [sp, #320] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: stp q0, q1, [sp] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #40] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #96] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #56] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: 
ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #88] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #44] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #64] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #152] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #72] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #68] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #136] +; NONEON-NOSVE-NEXT: ldp d2, d1, [sp, #120] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: str d1, [sp, #328] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #104] +; NONEON-NOSVE-NEXT: str d0, [sp, #168] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #164] +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #176] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #160] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #240] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #156] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #152] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #224] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #148] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #144] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #208] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #140] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #136] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #192] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #332] +; NONEON-NOSVE-NEXT: ldp q4, q3, 
[sp, #192] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #328] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #304] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #188] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #184] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #288] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #180] +; NONEON-NOSVE-NEXT: ldp q7, q6, [sp, #288] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #176] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #272] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #172] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #168] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #256] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #224] +; NONEON-NOSVE-NEXT: ldp q2, q5, [sp, #256] +; NONEON-NOSVE-NEXT: stp q3, q4, [x1, #32] +; NONEON-NOSVE-NEXT: stp q6, q7, [x1, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: stp q5, q2, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #336 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = uitofp <16 x i16> %op1 to <16 x double> @@ -449,9 +724,18 @@ define <2 x half> @ucvtf_v2i32_v2f16(<2 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: ucvtf_v2i32_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: str wzr, [sp, #12] +; NONEON-NOSVE-NEXT: ucvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = uitofp <2 x i32> %op1 to <2 x half> ret <2 x half> %res @@ -469,8 +753,24 @@ define <4 x half> @ucvtf_v4i32_v4f16(<4 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: ucvtf_v4i32_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ucvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ucvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = uitofp <4 x i32> %op1 to <4 x half> ret <4 x half> %res @@ -492,11 +792,39 @@ define <8 x half> @ucvtf_v8i32_v8f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: ucvtf_v8i32_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: ucvtf v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ucvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ucvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ucvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ucvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = uitofp <8 x i32> %op1 to <8 x half> @@ -525,17 +853,72 @@ define void @ucvtf_v16i32_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: ucvtf_v16i32_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q1, q3, [x0] -; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: ucvtf v2.4s, v2.4s -; NONEON-NOSVE-NEXT: ucvtf v1.4s, v1.4s -; NONEON-NOSVE-NEXT: ucvtf v3.4s, v3.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v1.4h, v1.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v2.4s -; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0, #32] +; 
NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: str q2, [sp, #48] +; NONEON-NOSVE-NEXT: ucvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #78] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #76] +; NONEON-NOSVE-NEXT: ucvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #74] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #72] +; NONEON-NOSVE-NEXT: ucvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #70] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #68] +; NONEON-NOSVE-NEXT: ucvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #66] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #64] +; NONEON-NOSVE-NEXT: ucvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #94] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #92] +; NONEON-NOSVE-NEXT: ucvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #90] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #88] +; NONEON-NOSVE-NEXT: ucvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #86] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #84] +; 
NONEON-NOSVE-NEXT: ucvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #82] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i32>, ptr %a %res = uitofp <16 x i32> %op1 to <16 x half> @@ -558,7 +941,14 @@ define <2 x float> @ucvtf_v2i32_v2f32(<2 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: ucvtf_v2i32_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ucvtf v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: ucvtf s1, w9 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = uitofp <2 x i32> %op1 to <2 x float> ret <2 x float> %res @@ -575,7 +965,18 @@ define <4 x float> @ucvtf_v4i32_v4f32(<4 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: ucvtf_v4i32_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ucvtf s1, w9 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ucvtf s1, w9 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = uitofp <4 x i32> %op1 to <4 x float> ret <4 x float> %res @@ -593,10 +994,28 @@ define void @ucvtf_v8i32_v8f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: ucvtf_v8i32_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: ucvtf v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ucvtf s1, w9 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ucvtf s1, w9 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ucvtf s1, w9 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ucvtf s1, w9 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = uitofp <8 x i32> %op1 to <8 x float> @@ -620,8 +1039,16 @@ define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: ucvtf_v2i32_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; 
NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = uitofp <2 x i32> %op1 to <2 x double> ret <2 x double> %res @@ -643,15 +1070,23 @@ define void @ucvtf_v4i32_v4f64(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: ucvtf_v4i32_v4f64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: stp q0, q1, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i32>, ptr %a %res = uitofp <4 x i32> %op1 to <4 x double> @@ -681,21 +1116,37 @@ define void @ucvtf_v8i32_v8f64(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: ucvtf_v8i32_v8f64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: ushll v3.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: ucvtf v2.2d, v2.2d -; NONEON-NOSVE-NEXT: ucvtf v3.2d, v3.2d -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-128]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #44] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #40] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #36] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #32] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #60] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #64] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #56] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #52] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #48] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = uitofp <8 x i32> %op1 to <8 x double> @@ -726,14 +1177,17 @@ define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) { ; ; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov x8, v0.d[1] -; NONEON-NOSVE-NEXT: fmov x9, d0 -; NONEON-NOSVE-NEXT: ucvtf s1, x9 +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: ucvtf s0, x9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] ; NONEON-NOSVE-NEXT: ucvtf s0, x8 -; NONEON-NOSVE-NEXT: fcvt h2, s0 -; NONEON-NOSVE-NEXT: fcvt h0, s1 -; NONEON-NOSVE-NEXT: mov v0.h[1], v2.h[0] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = uitofp <2 x i64> %op1 to <2 x half> ret <2 x half> %res @@ -758,12 +1212,25 @@ define <4 x half> @ucvtf_v4i64_v4f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: ucvtf_v4i64_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d -; NONEON-NOSVE-NEXT: fcvtn2 v0.4s, v1.2d -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ucvtf s0, x9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ucvtf s0, x9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = uitofp <4 x i64> %op1 to <4 x half> @@ -801,18 +1268,43 @@ define <8 x half> @ucvtf_v8i64_v8f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: ucvtf_v8i64_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0, #32] -; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: ucvtf v2.2d, v2.2d -; NONEON-NOSVE-NEXT: ucvtf v3.2d, v3.2d -; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d -; NONEON-NOSVE-NEXT: fcvtn v2.2s, v2.2d -; NONEON-NOSVE-NEXT: fcvtn2 v0.4s, v1.2d -; NONEON-NOSVE-NEXT: fcvtn2 v2.4s, v3.2d -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v2.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: str q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: str q2, [sp] +; NONEON-NOSVE-NEXT: ucvtf s0, x9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #78] +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #76] +; NONEON-NOSVE-NEXT: ucvtf s0, 
x9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #74] +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #72] +; NONEON-NOSVE-NEXT: ucvtf s0, x9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #70] +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #68] +; NONEON-NOSVE-NEXT: ucvtf s0, x9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #66] +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i64>, ptr %a %res = uitofp <8 x i64> %op1 to <8 x half> @@ -835,8 +1327,14 @@ define <2 x float> @ucvtf_v2i64_v2f32(<2 x i64> %op1) { ; ; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: ucvtf s1, x9 +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = uitofp <2 x i64> %op1 to <2 x float> ret <2 x float> %res @@ -858,11 +1356,19 @@ define <4 x float> @ucvtf_v4i64_v4f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: ucvtf_v4i64_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d -; NONEON-NOSVE-NEXT: fcvtn2 v0.4s, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ucvtf s1, x9 +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ucvtf s1, x9 +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = uitofp <4 x i64> %op1 to <4 x float> @@ -891,17 +1397,32 @@ define void @ucvtf_v8i64_v8f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: ucvtf_v8i64_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q1, q3, [x0] -; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: ucvtf v2.2d, v2.2d -; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: ucvtf v3.2d, v3.2d -; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d -; NONEON-NOSVE-NEXT: fcvtn v1.2s, v1.2d -; NONEON-NOSVE-NEXT: fcvtn2 v0.4s, v2.2d -; NONEON-NOSVE-NEXT: fcvtn2 v1.4s, v3.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0, #32] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: str q2, [sp, #48] +; NONEON-NOSVE-NEXT: ucvtf s1, x9 +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #72] +; NONEON-NOSVE-NEXT: ucvtf s1, x9 +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #64] +; NONEON-NOSVE-NEXT: ucvtf s1, x9 +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #88] +; NONEON-NOSVE-NEXT: ucvtf s1, x9 +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #80] 
+; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i64>, ptr %a %res = uitofp <8 x i64> %op1 to <8 x float> @@ -924,7 +1445,14 @@ define <2 x double> @ucvtf_v2i64_v2f64(<2 x i64> %op1) { ; ; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: ucvtf d1, x9 +; NONEON-NOSVE-NEXT: ucvtf d0, x8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = uitofp <2 x i64> %op1 to <2 x double> ret <2 x double> %res @@ -942,10 +1470,20 @@ define void @ucvtf_v4i64_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: ucvtf_v4i64_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ucvtf d1, x9 +; NONEON-NOSVE-NEXT: ucvtf d0, x8 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ucvtf d1, x9 +; NONEON-NOSVE-NEXT: ucvtf d0, x8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = uitofp <4 x i64> %op1 to <4 x double> @@ -968,9 +1506,26 @@ define <4 x half> @scvtf_v4i16_v4f16(<4 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: scvtf_v4i16_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #6] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = sitofp <4 x i16> %op1 to <4 x half> ret <4 x half> %res @@ -988,17 +1543,43 @@ define void @scvtf_v8i16_v8f16(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: sshll v1.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] -; NONEON-NOSVE-NEXT: scvtf v1.4s, v1.4s -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: fcvtn v1.4h, v1.4s -; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s -; NONEON-NOSVE-NEXT: str q1, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #14] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #22] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #18] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %res = sitofp <8 x i16> %op1 to <8 x half> @@ -1018,25 +1599,76 @@ define void @scvtf_v16i16_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f16: ; NONEON-NOSVE: 
// %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: sshll v2.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: sshll v0.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: scvtf v2.4s, v2.4s -; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: scvtf v1.4s, v1.4s -; NONEON-NOSVE-NEXT: scvtf v3.4s, v3.4s -; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v1.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v3.4s -; NONEON-NOSVE-NEXT: stp q2, q0, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #54] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; 
NONEON-NOSVE-NEXT: str h0, [sp, #50] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = sitofp <16 x i16> %op1 to <16 x half> @@ -1059,9 +1691,15 @@ define <2 x float> @scvtf_v2i16_v2f32(<2 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: scvtf_v2i16_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16 -; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16 -; NONEON-NOSVE-NEXT: scvtf v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #4] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: scvtf s1, w9 +; NONEON-NOSVE-NEXT: stp s1, s0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i16> %op1 to <2 x float> ret <2 x float> %res @@ -1079,8 +1717,21 @@ define <4 x float> @scvtf_v4i16_v4f32(<4 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: scvtf_v4i16_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #14] +; NONEON-NOSVE-NEXT: scvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #12] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #10] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: scvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #8] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sitofp <4 x i16> %op1 to <4 x float> ret <4 x float> %res @@ -1102,15 +1753,33 @@ define void @scvtf_v8i16_v8f32(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: scvtf v1.4s, v1.4s -; NONEON-NOSVE-NEXT: stp q0, q1, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: scvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: scvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: scvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: scvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %res = sitofp <8 x i16> %op1 to <8 x float> @@ -1140,21 +1809,57 @@ define void @scvtf_v16i16_v16f32(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: scvtf v1.4s, v1.4s -; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: scvtf v2.4s, v2.4s -; NONEON-NOSVE-NEXT: scvtf v3.4s, v3.4s -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-128]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #46] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: scvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #44] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #42] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #88] +; NONEON-NOSVE-NEXT: scvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #40] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #38] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #80] +; NONEON-NOSVE-NEXT: scvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #36] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #34] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #72] +; NONEON-NOSVE-NEXT: scvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #32] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #62] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #64] +; NONEON-NOSVE-NEXT: scvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #64] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #58] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #120] +; NONEON-NOSVE-NEXT: scvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, 
[sp, #56] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #54] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #112] +; NONEON-NOSVE-NEXT: scvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #52] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #50] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #104] +; NONEON-NOSVE-NEXT: scvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #48] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = sitofp <16 x i16> %op1 to <16 x float> @@ -1180,10 +1885,16 @@ define <2 x double> @scvtf_v2i16_v2f64(<2 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: scvtf_v2i16_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16 -; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16 -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i16> %op1 to <2 x double> ret <2 x double> %res @@ -1205,17 +1916,29 @@ define void @scvtf_v4i16_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: scvtf_v4i16_v4f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 ; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: stp q0, q1, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #12] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i16>, ptr %a %res = sitofp <4 x i16> %op1 to <4 x double> @@ -1248,26 +1971,49 @@ define void @scvtf_v8i16_v8f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #160 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: scvtf v2.2d, v2.2d -; NONEON-NOSVE-NEXT: scvtf v3.2d, v3.2d -; NONEON-NOSVE-NEXT: stp q0, q2, [x1] -; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #80] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #144] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #128] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, 
w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #128] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %res = sitofp <8 x i16> %op1 to <8 x double> @@ -1320,42 +2066,92 @@ define void @scvtf_v16i16_v16f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #336 +; NONEON-NOSVE-NEXT: str x29, [sp, #320] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 336 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 ; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #64] -; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] -; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ldr d4, [sp, #88] -; NONEON-NOSVE-NEXT: ldr d6, [sp, #72] -; NONEON-NOSVE-NEXT: ldr d7, [sp, #40] -; NONEON-NOSVE-NEXT: sshll v5.2d, v5.2s, #0 -; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: sshll v4.2d, v4.2s, #0 -; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: sshll v6.2d, v6.2s, #0 -; NONEON-NOSVE-NEXT: sshll v7.2d, v7.2s, #0 -; NONEON-NOSVE-NEXT: scvtf v2.2d, v2.2d -; NONEON-NOSVE-NEXT: 
scvtf v5.2d, v5.2d -; NONEON-NOSVE-NEXT: scvtf v3.2d, v3.2d -; NONEON-NOSVE-NEXT: scvtf v4.2d, v4.2d -; NONEON-NOSVE-NEXT: stp q0, q5, [x1] -; NONEON-NOSVE-NEXT: scvtf v0.2d, v7.2d -; NONEON-NOSVE-NEXT: stp q1, q4, [x1, #64] -; NONEON-NOSVE-NEXT: scvtf v1.2d, v6.2d -; NONEON-NOSVE-NEXT: stp q2, q0, [x1, #32] -; NONEON-NOSVE-NEXT: stp q3, q1, [x1, #96] -; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: ldr x29, [sp, #320] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: stp q0, q1, [sp] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #40] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #48] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #96] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #52] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #56] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #88] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #44] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #66] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #64] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #152] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #72] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #70] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #68] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #56] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #136] +; NONEON-NOSVE-NEXT: ldp d2, d1, [sp, #120] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #60] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: str d1, [sp, #328] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, 
#104] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #160] +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #176] +; NONEON-NOSVE-NEXT: str d0, [sp, #168] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #152] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #240] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #144] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #224] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #136] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #208] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #332] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #192] +; NONEON-NOSVE-NEXT: scvtf d1, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #328] +; NONEON-NOSVE-NEXT: ldp q4, q3, [sp, #192] +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #184] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #304] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #176] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #288] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #168] +; NONEON-NOSVE-NEXT: ldp q7, q6, [sp, #288] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #272] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #256] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #224] +; NONEON-NOSVE-NEXT: ldp q2, q5, [sp, #256] +; NONEON-NOSVE-NEXT: stp q3, q4, [x1, #32] +; NONEON-NOSVE-NEXT: stp q6, q7, [x1, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: stp q5, q2, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #336 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = sitofp <16 x i16> %op1 to <16 x double> @@ -1379,9 +2175,18 @@ define <2 x half> @scvtf_v2i32_v2f16(<2 x i32> %op1) { ; ; 
NONEON-NOSVE-LABEL: scvtf_v2i32_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: str wzr, [sp, #12] +; NONEON-NOSVE-NEXT: scvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i32> %op1 to <2 x half> ret <2 x half> %res @@ -1399,8 +2204,24 @@ define <4 x half> @scvtf_v4i32_v4f16(<4 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: scvtf_v4i32_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: scvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: scvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sitofp <4 x i32> %op1 to <4 x half> ret <4 x half> %res @@ -1422,11 +2243,39 @@ define <8 x half> @scvtf_v8i32_v8f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: scvtf_v8i32_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: scvtf v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: scvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: scvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: scvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: scvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = sitofp <8 x i32> %op1 to <8 x half> @@ -1448,7 +2297,14 @@ define <2 x float> @scvtf_v2i32_v2f32(<2 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: scvtf_v2i32_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: scvtf v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: scvtf s1, w9 +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i32> %op1 to <2 x float> ret <2 x float> %res @@ -1465,7 +2321,18 @@ define <4 x float> @scvtf_v4i32_v4f32(<4 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: scvtf_v4i32_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: scvtf s1, w9 +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: scvtf s1, w9 +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sitofp <4 x i32> %op1 to <4 x float> ret <4 x float> %res @@ -1483,10 +2350,28 @@ define void @scvtf_v8i32_v8f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: scvtf_v8i32_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: scvtf v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: scvtf s1, w9 +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: scvtf s1, w9 +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: scvtf s1, w9 +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: scvtf s1, w9 +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = sitofp <8 x i32> %op1 to <8 x float> @@ -1510,8 +2395,15 @@ define <2 x double> @scvtf_v2i32_v2f64(<2 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: scvtf_v2i32_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i32> %op1 to <2 x double> ret <2 x double> %res @@ -1533,15 +2425,21 @@ define void @scvtf_v4i32_v4f64(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: scvtf_v4i32_v4f64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: stp q0, q1, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i32>, ptr %a %res = sitofp <4 x i32> %op1 to <4 x double> @@ -1571,21 +2469,33 @@ define void @scvtf_v8i32_v8f64(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: scvtf_v8i32_v8f64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: scvtf v2.2d, v2.2d -; NONEON-NOSVE-NEXT: scvtf v3.2d, v3.2d -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-128]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #64] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = sitofp <8 x i32> %op1 to <8 x double> @@ -1634,36 +2544,68 @@ define void @scvtf_v16i32_v16f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: scvtf_v16i32_v16f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #272 +; NONEON-NOSVE-NEXT: str x29, [sp, #256] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 272 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 ; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ldr x29, [sp, #256] // 8-byte Folded Reload ; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] -; NONEON-NOSVE-NEXT: stp q0, q2, [sp, #-64]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 -; NONEON-NOSVE-NEXT: stp q1, q3, [sp, #32] -; NONEON-NOSVE-NEXT: ldr d4, [sp, #24] -; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] -; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: ldr d6, [sp, #40] -; NONEON-NOSVE-NEXT: sshll v4.2d, v4.2s, #0 -; NONEON-NOSVE-NEXT: ldr d7, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: sshll v5.2d, v5.2s, #0 -; NONEON-NOSVE-NEXT: scvtf v2.2d, v2.2d -; NONEON-NOSVE-NEXT: sshll v6.2d, v6.2s, #0 -; NONEON-NOSVE-NEXT: scvtf v3.2d, v3.2d -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: sshll v7.2d, v7.2s, #0 -; NONEON-NOSVE-NEXT: scvtf v4.2d, v4.2d -; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: scvtf v5.2d, v5.2d -; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: stp q2, q4, [x1, #96] -; NONEON-NOSVE-NEXT: scvtf v2.2d, v6.2d -; NONEON-NOSVE-NEXT: stp q3, q5, [x1, #64] -; NONEON-NOSVE-NEXT: scvtf v3.2d, v7.2d -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #64 +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q2, [sp] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: str q3, [sp, #64] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #48] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #104] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: str d0, [sp, #264] +; NONEON-NOSVE-NEXT: ldp d0, d2, [sp, #16] +; NONEON-NOSVE-NEXT: stp d2, d0, [sp, #88] +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: scvtf d2, w9 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #120] +; NONEON-NOSVE-NEXT: scvtf d0, w9 +; NONEON-NOSVE-NEXT: str d0, [sp, #152] +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: stp d2, 
d0, [sp, #136] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #192] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #208] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #268] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #224] +; NONEON-NOSVE-NEXT: scvtf d1, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #264] +; NONEON-NOSVE-NEXT: ldp q4, q6, [sp, #208] +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #240] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #96] +; NONEON-NOSVE-NEXT: ldr q7, [sp, #240] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #160] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldr q5, [sp, #160] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #176] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q2, q3, [sp, #176] +; NONEON-NOSVE-NEXT: stp q7, q6, [x1, #64] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: stp q4, q3, [x1, #32] +; NONEON-NOSVE-NEXT: stp q2, q5, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #272 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i32>, ptr %a %res = sitofp <16 x i32> %op1 to <16 x double> @@ -1694,14 +2636,17 @@ define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) { ; ; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov x8, v0.d[1] -; NONEON-NOSVE-NEXT: fmov x9, d0 -; NONEON-NOSVE-NEXT: scvtf s1, x9 +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: scvtf s0, x9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] ; NONEON-NOSVE-NEXT: scvtf s0, x8 -; NONEON-NOSVE-NEXT: fcvt h2, s0 -; NONEON-NOSVE-NEXT: fcvt h0, s1 -; NONEON-NOSVE-NEXT: mov v0.h[1], v2.h[0] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i64> %op1 to <2 x half> ret <2 x half> %res @@ -1726,12 +2671,25 @@ define <4 x half> @scvtf_v4i64_v4f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: scvtf_v4i64_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d -; NONEON-NOSVE-NEXT: fcvtn2 v0.4s, v1.2d -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: scvtf s0, x9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: scvtf s0, x8 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: scvtf s0, x9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: scvtf s0, x8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = sitofp <4 x i64> %op1 to <4 x half> @@ -1754,8 +2712,14 @@ define <2 x float> @scvtf_v2i64_v2f32(<2 x i64> %op1) { ; ; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: scvtf s1, x9 +; NONEON-NOSVE-NEXT: scvtf s0, x8 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i64> %op1 to <2 x float> ret <2 x float> %res @@ -1777,11 +2741,19 @@ define <4 x float> @scvtf_v4i64_v4f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: scvtf_v4i64_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d -; NONEON-NOSVE-NEXT: fcvtn2 v0.4s, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: scvtf s1, x9 +; NONEON-NOSVE-NEXT: scvtf s0, x8 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: scvtf s1, x9 +; NONEON-NOSVE-NEXT: scvtf s0, x8 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = sitofp <4 x i64> %op1 to <4 x float> @@ -1803,7 +2775,14 @@ define <2 x double> @scvtf_v2i64_v2f64(<2 x i64> %op1) { ; ; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: scvtf d1, x9 +; NONEON-NOSVE-NEXT: scvtf d0, x8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i64> %op1 to <2 x double> ret <2 x double> %res @@ -1821,10 +2800,20 @@ define void @scvtf_v4i64_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: scvtf_v4i64_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: scvtf d1, x9 +; NONEON-NOSVE-NEXT: scvtf d0, x8 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: scvtf d1, x9 +; NONEON-NOSVE-NEXT: scvtf d0, x8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = sitofp <4 x i64> %op1 to <4 x double> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll index 250929df6b3c35..af15d5f67ad15c 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll @@ -22,9 +22,40 @@ define <4 x i8> @select_v4i8(<4 x i8> %op1, <4 x i8> %op2, <4 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: select_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v2.4h, v2.4h, #15 -; NONEON-NOSVE-NEXT: cmlt v2.4h, v2.4h, #0 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #22] +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #18] +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: tst w9, #0xffff +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: csel w9, w13, w12, ne +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #4] +; NONEON-NOSVE-NEXT: tst w11, #0xffff +; NONEON-NOSVE-NEXT: 
strh w9, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #2] +; NONEON-NOSVE-NEXT: csel w9, w12, w9, ne +; NONEON-NOSVE-NEXT: tst w10, #0xffff +; NONEON-NOSVE-NEXT: ldrh w10, [sp] +; NONEON-NOSVE-NEXT: strh w9, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: csel w9, w11, w9, ne +; NONEON-NOSVE-NEXT: tst w8, #0xffff +; NONEON-NOSVE-NEXT: strh w9, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: csel w8, w10, w9, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select <4 x i1> %mask, <4 x i8> %op1, <4 x i8> %op2 ret <4 x i8> %sel @@ -47,9 +78,68 @@ define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, <8 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: select_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v2.8b, v2.8b, #7 -; NONEON-NOSVE-NEXT: cmlt v2.8b, v2.8b, #0 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #23] +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #15] +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #7] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #21] +; NONEON-NOSVE-NEXT: sbfx w13, w13, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w15, w15, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #20] +; NONEON-NOSVE-NEXT: sbfx w14, w14, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #18] +; NONEON-NOSVE-NEXT: tst w13, #0xff +; NONEON-NOSVE-NEXT: sbfx w12, w12, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #17] +; NONEON-NOSVE-NEXT: csel w13, w17, w16, ne +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #6] +; NONEON-NOSVE-NEXT: tst w15, #0xff +; NONEON-NOSVE-NEXT: strb w13, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #14] +; NONEON-NOSVE-NEXT: 
ldrb w15, [sp, #5] +; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: csel w13, w16, w13, ne +; NONEON-NOSVE-NEXT: tst w14, #0xff +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #4] +; NONEON-NOSVE-NEXT: strb w13, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: csel w13, w15, w13, ne +; NONEON-NOSVE-NEXT: tst w12, #0xff +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: strb w13, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #12] +; NONEON-NOSVE-NEXT: csel w12, w14, w13, ne +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #3] +; NONEON-NOSVE-NEXT: tst w11, #0xff +; NONEON-NOSVE-NEXT: strb w12, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #11] +; NONEON-NOSVE-NEXT: csel w11, w13, w12, ne +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #2] +; NONEON-NOSVE-NEXT: tst w10, #0xff +; NONEON-NOSVE-NEXT: strb w11, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #10] +; NONEON-NOSVE-NEXT: csel w10, w12, w11, ne +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #1] +; NONEON-NOSVE-NEXT: tst w9, #0xff +; NONEON-NOSVE-NEXT: strb w10, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #9] +; NONEON-NOSVE-NEXT: csel w9, w11, w10, ne +; NONEON-NOSVE-NEXT: ldrb w10, [sp] +; NONEON-NOSVE-NEXT: tst w8, #0xff +; NONEON-NOSVE-NEXT: strb w9, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: csel w8, w10, w9, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select <8 x i1> %mask, <8 x i8> %op1, <8 x i8> %op2 ret <8 x i8> %sel @@ -72,9 +162,124 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask) ; ; NONEON-NOSVE-LABEL: select_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v2.16b, v2.16b, #7 -; NONEON-NOSVE-NEXT: cmlt v2.16b, v2.16b, #0 -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; 
NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: stp q1, q2, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #47] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldrb w4, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w5, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w6, [sp, #15] +; NONEON-NOSVE-NEXT: ldrb w3, [sp, #45] +; NONEON-NOSVE-NEXT: sbfx w2, w2, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w4, w4, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w1, [sp, #44] +; NONEON-NOSVE-NEXT: sbfx w3, w3, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w0, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w18, [sp, #42] +; NONEON-NOSVE-NEXT: tst w2, #0xff +; NONEON-NOSVE-NEXT: sbfx w1, w1, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #41] +; NONEON-NOSVE-NEXT: csel w2, w6, w5, ne +; NONEON-NOSVE-NEXT: ldrb w5, [sp, #14] +; NONEON-NOSVE-NEXT: tst w4, #0xff +; NONEON-NOSVE-NEXT: strb w2, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w4, [sp, #13] +; NONEON-NOSVE-NEXT: sbfx w0, w0, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w18, w18, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w17, w17, #0, #1 +; NONEON-NOSVE-NEXT: csel w2, w5, w2, ne +; NONEON-NOSVE-NEXT: tst w3, #0xff +; NONEON-NOSVE-NEXT: ldrb w3, [sp, #12] +; NONEON-NOSVE-NEXT: strb w2, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #37] +; NONEON-NOSVE-NEXT: csel w2, w4, w2, ne +; NONEON-NOSVE-NEXT: tst w1, #0xff +; NONEON-NOSVE-NEXT: sbfx w16, w16, #0, #1 +; NONEON-NOSVE-NEXT: strb w2, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #28] +; NONEON-NOSVE-NEXT: sbfx w15, w15, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w14, w14, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w13, w13, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #36] +; NONEON-NOSVE-NEXT: csel w1, w3, w2, ne +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #11] +; NONEON-NOSVE-NEXT: tst w0, #0xff +; NONEON-NOSVE-NEXT: strb 
w1, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w1, [sp, #27] +; NONEON-NOSVE-NEXT: sbfx w12, w12, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: csel w0, w2, w1, ne +; NONEON-NOSVE-NEXT: ldrb w1, [sp, #10] +; NONEON-NOSVE-NEXT: tst w18, #0xff +; NONEON-NOSVE-NEXT: strb w0, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w0, [sp, #26] +; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #32] +; NONEON-NOSVE-NEXT: csel w18, w1, w0, ne +; NONEON-NOSVE-NEXT: ldrb w0, [sp, #9] +; NONEON-NOSVE-NEXT: tst w17, #0xff +; NONEON-NOSVE-NEXT: strb w18, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w18, [sp, #25] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: csel w17, w0, w18, ne +; NONEON-NOSVE-NEXT: ldrb w18, [sp, #8] +; NONEON-NOSVE-NEXT: tst w16, #0xff +; NONEON-NOSVE-NEXT: strb w17, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #24] +; NONEON-NOSVE-NEXT: csel w16, w18, w17, ne +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #7] +; NONEON-NOSVE-NEXT: tst w15, #0xff +; NONEON-NOSVE-NEXT: strb w16, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #23] +; NONEON-NOSVE-NEXT: csel w15, w17, w16, ne +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #6] +; NONEON-NOSVE-NEXT: tst w14, #0xff +; NONEON-NOSVE-NEXT: strb w15, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #22] +; NONEON-NOSVE-NEXT: csel w14, w16, w15, ne +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #5] +; NONEON-NOSVE-NEXT: tst w13, #0xff +; NONEON-NOSVE-NEXT: strb w14, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #21] +; NONEON-NOSVE-NEXT: csel w13, w15, w14, ne +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #4] +; NONEON-NOSVE-NEXT: tst w12, #0xff +; NONEON-NOSVE-NEXT: strb w13, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #20] +; NONEON-NOSVE-NEXT: csel w12, w14, w13, ne +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #3] +; NONEON-NOSVE-NEXT: tst w11, #0xff 
+; NONEON-NOSVE-NEXT: strb w12, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #19] +; NONEON-NOSVE-NEXT: csel w11, w13, w12, ne +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #2] +; NONEON-NOSVE-NEXT: tst w10, #0xff +; NONEON-NOSVE-NEXT: strb w11, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #18] +; NONEON-NOSVE-NEXT: csel w10, w12, w11, ne +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #1] +; NONEON-NOSVE-NEXT: tst w9, #0xff +; NONEON-NOSVE-NEXT: strb w10, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #17] +; NONEON-NOSVE-NEXT: csel w9, w11, w10, ne +; NONEON-NOSVE-NEXT: ldrb w10, [sp] +; NONEON-NOSVE-NEXT: tst w8, #0xff +; NONEON-NOSVE-NEXT: strb w9, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #16] +; NONEON-NOSVE-NEXT: csel w8, w10, w9, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %sel = select <16 x i1> %mask, <16 x i8> %op1, <16 x i8> %op2 ret <16 x i8> %sel @@ -95,14 +300,204 @@ define void @select_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: select_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] -; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] -; NONEON-NOSVE-NEXT: cmeq v4.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: cmeq v5.16b, v2.16b, v3.16b -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v4.16b -; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b -; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #208 +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #112] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #128] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #144] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #160] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #176] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #192] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 208 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset 
w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #20] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, eq +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: ldrb w18, [sp, #37] +; NONEON-NOSVE-NEXT: csel w9, w11, w10, eq +; NONEON-NOSVE-NEXT: cmp w13, w12 +; NONEON-NOSVE-NEXT: ldrb w1, [sp, #21] +; NONEON-NOSVE-NEXT: str w8, [sp, #12] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: csel w8, w13, w12, eq +; NONEON-NOSVE-NEXT: cmp w16, w14 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #22] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #4] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: csel w8, w16, w14, eq +; NONEON-NOSVE-NEXT: cmp w1, w18 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #23] +; NONEON-NOSVE-NEXT: csel w12, w1, w18, eq +; NONEON-NOSVE-NEXT: cmp w2, w13 +; NONEON-NOSVE-NEXT: ldrb w18, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w1, [sp, #24] +; NONEON-NOSVE-NEXT: csel w13, w2, w13, eq +; NONEON-NOSVE-NEXT: cmp w16, w14 +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w5, [sp, #25] +; NONEON-NOSVE-NEXT: csel w14, w16, w14, eq +; 
NONEON-NOSVE-NEXT: cmp w1, w18 +; NONEON-NOSVE-NEXT: ldrb w6, [sp, #26] +; NONEON-NOSVE-NEXT: csel w16, w1, w18, eq +; NONEON-NOSVE-NEXT: ldrb w1, [sp, #42] +; NONEON-NOSVE-NEXT: cmp w5, w2 +; NONEON-NOSVE-NEXT: csel w18, w5, w2, eq +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w5, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w6, w1 +; NONEON-NOSVE-NEXT: ldrb w19, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w29, [sp, #45] +; NONEON-NOSVE-NEXT: csel w1, w6, w1, eq +; NONEON-NOSVE-NEXT: ldrb w6, [sp, #44] +; NONEON-NOSVE-NEXT: cmp w5, w2 +; NONEON-NOSVE-NEXT: ldrb w30, [sp, #29] +; NONEON-NOSVE-NEXT: str w8, [sp] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: csel w2, w5, w2, eq +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w19, w6 +; NONEON-NOSVE-NEXT: stp q2, q3, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w21, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #31] +; NONEON-NOSVE-NEXT: csel w5, w19, w6, eq +; NONEON-NOSVE-NEXT: cmp w30, w29 +; NONEON-NOSVE-NEXT: ldrb w22, [sp, #64] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #48] +; NONEON-NOSVE-NEXT: csel w6, w30, w29, eq +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: ldrb w28, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w29, [sp, #49] +; NONEON-NOSVE-NEXT: csel w19, w8, w9, eq +; NONEON-NOSVE-NEXT: cmp w10, w21 +; NONEON-NOSVE-NEXT: ldrb w27, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: csel w21, w10, w21, eq +; NONEON-NOSVE-NEXT: cmp w11, w22 +; NONEON-NOSVE-NEXT: ldrb w26, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #51] +; NONEON-NOSVE-NEXT: csel w22, w11, w22, eq +; NONEON-NOSVE-NEXT: cmp w29, w28 +; NONEON-NOSVE-NEXT: ldrb w25, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #52] +; NONEON-NOSVE-NEXT: csel w11, w29, w28, eq +; NONEON-NOSVE-NEXT: cmp w8, w27 +; NONEON-NOSVE-NEXT: ldrb w24, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w28, [sp, #53] +; NONEON-NOSVE-NEXT: csel w8, w8, w27, eq +; NONEON-NOSVE-NEXT: cmp w9, w26 +; 
NONEON-NOSVE-NEXT: ldrb w23, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w27, [sp, #54] +; NONEON-NOSVE-NEXT: csel w9, w9, w26, eq +; NONEON-NOSVE-NEXT: cmp w10, w25 +; NONEON-NOSVE-NEXT: ldrb w20, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w26, [sp, #55] +; NONEON-NOSVE-NEXT: csel w10, w10, w25, eq +; NONEON-NOSVE-NEXT: cmp w28, w24 +; NONEON-NOSVE-NEXT: ldrb w7, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w25, [sp, #56] +; NONEON-NOSVE-NEXT: csel w24, w28, w24, eq +; NONEON-NOSVE-NEXT: cmp w27, w23 +; NONEON-NOSVE-NEXT: ldrb w4, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w28, [sp, #57] +; NONEON-NOSVE-NEXT: csel w23, w27, w23, eq +; NONEON-NOSVE-NEXT: cmp w26, w20 +; NONEON-NOSVE-NEXT: ldrb w3, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w27, [sp, #58] +; NONEON-NOSVE-NEXT: csel w20, w26, w20, eq +; NONEON-NOSVE-NEXT: cmp w25, w7 +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w26, [sp, #59] +; NONEON-NOSVE-NEXT: csel w7, w25, w7, eq +; NONEON-NOSVE-NEXT: cmp w28, w4 +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w25, [sp, #60] +; NONEON-NOSVE-NEXT: csel w4, w28, w4, eq +; NONEON-NOSVE-NEXT: cmp w27, w3 +; NONEON-NOSVE-NEXT: csel w3, w27, w3, eq +; NONEON-NOSVE-NEXT: cmp w26, w17 +; NONEON-NOSVE-NEXT: ldrb w28, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w27, [sp, #61] +; NONEON-NOSVE-NEXT: csel w17, w26, w17, eq +; NONEON-NOSVE-NEXT: cmp w25, w15 +; NONEON-NOSVE-NEXT: ldrb w26, [sp, #78] +; NONEON-NOSVE-NEXT: csel w15, w25, w15, eq +; NONEON-NOSVE-NEXT: ldrb w25, [sp, #62] +; NONEON-NOSVE-NEXT: cmp w27, w28 +; NONEON-NOSVE-NEXT: ldrb w29, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w30, [sp, #63] +; NONEON-NOSVE-NEXT: strb w9, [sp, #99] +; NONEON-NOSVE-NEXT: csel w27, w27, w28, eq +; NONEON-NOSVE-NEXT: cmp w25, w26 +; NONEON-NOSVE-NEXT: strb w8, [sp, #98] +; NONEON-NOSVE-NEXT: csel w25, w25, w26, eq +; NONEON-NOSVE-NEXT: cmp w30, w29 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: csel w26, w30, w29, eq +; NONEON-NOSVE-NEXT: ldrb 
w28, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w29, [sp, #16] +; NONEON-NOSVE-NEXT: strb w26, [sp, #111] +; NONEON-NOSVE-NEXT: strb w9, [sp, #84] +; NONEON-NOSVE-NEXT: cmp w29, w28 +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w25, [sp, #110] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #144] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w27, [sp, #109] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: csel w8, w29, w28, eq +; NONEON-NOSVE-NEXT: strb w15, [sp, #108] +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #128] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w17, [sp, #107] +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #112] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w3, [sp, #106] +; NONEON-NOSVE-NEXT: strb w4, [sp, #105] +; NONEON-NOSVE-NEXT: strb w7, [sp, #104] +; NONEON-NOSVE-NEXT: strb w20, [sp, #103] +; NONEON-NOSVE-NEXT: strb w23, [sp, #102] +; NONEON-NOSVE-NEXT: strb w24, [sp, #101] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #160] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w10, [sp, #100] +; NONEON-NOSVE-NEXT: strb w11, [sp, #97] +; NONEON-NOSVE-NEXT: strb w22, [sp, #96] +; NONEON-NOSVE-NEXT: strb w21, [sp, #95] +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #176] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w19, [sp, #94] +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #192] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w6, [sp, #93] +; NONEON-NOSVE-NEXT: strb w5, [sp, #92] +; NONEON-NOSVE-NEXT: strb w2, [sp, #91] +; NONEON-NOSVE-NEXT: strb w1, [sp, #90] +; NONEON-NOSVE-NEXT: strb w18, [sp, #89] +; NONEON-NOSVE-NEXT: strb w16, [sp, #88] +; NONEON-NOSVE-NEXT: strb w14, [sp, #87] +; NONEON-NOSVE-NEXT: strb w13, [sp, #86] +; NONEON-NOSVE-NEXT: strb w12, [sp, #85] +; NONEON-NOSVE-NEXT: strb w9, [sp, #82] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #80] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: 
add sp, sp, #208 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -129,9 +524,25 @@ define <2 x i16> @select_v2i16(<2 x i16> %op1, <2 x i16> %op2, <2 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: select_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v2.2s, v2.2s, #31 -; NONEON-NOSVE-NEXT: cmlt v2.2s, v2.2s, #0 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #16] +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #12] +; NONEON-NOSVE-NEXT: ldr w11, [sp, #4] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: csel w8, w11, w10, ne +; NONEON-NOSVE-NEXT: ldr w10, [sp] +; NONEON-NOSVE-NEXT: cmp w9, #0 +; NONEON-NOSVE-NEXT: str w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: csel w8, w10, w8, ne +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select <2 x i1> %mask, <2 x i16> %op1, <2 x i16> %op2 ret <2 x i16> %sel @@ -154,9 +565,40 @@ define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, <4 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: select_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v2.4h, v2.4h, #15 -; NONEON-NOSVE-NEXT: cmlt v2.4h, v2.4h, #0 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #22] +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #18] +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: 
sbfx w11, w11, #0, #1 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: tst w9, #0xffff +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: csel w9, w13, w12, ne +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #4] +; NONEON-NOSVE-NEXT: tst w11, #0xffff +; NONEON-NOSVE-NEXT: strh w9, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #2] +; NONEON-NOSVE-NEXT: csel w9, w12, w9, ne +; NONEON-NOSVE-NEXT: tst w10, #0xffff +; NONEON-NOSVE-NEXT: ldrh w10, [sp] +; NONEON-NOSVE-NEXT: strh w9, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: csel w9, w11, w9, ne +; NONEON-NOSVE-NEXT: tst w8, #0xffff +; NONEON-NOSVE-NEXT: strh w9, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: csel w8, w10, w9, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select <4 x i1> %mask, <4 x i16> %op1, <4 x i16> %op2 ret <4 x i16> %sel @@ -180,10 +622,68 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: select_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushll v2.8h, v2.8b, #0 -; NONEON-NOSVE-NEXT: shl v2.8h, v2.8h, #15 -; NONEON-NOSVE-NEXT: cmlt v2.8h, v2.8h, #0 -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: str d2, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #47] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #45] +; NONEON-NOSVE-NEXT: sbfx w13, w13, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w15, w15, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #44] +; NONEON-NOSVE-NEXT: sbfx w14, w14, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, 
#43] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #42] +; NONEON-NOSVE-NEXT: tst w13, #0xffff +; NONEON-NOSVE-NEXT: sbfx w12, w12, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: csel w13, w17, w16, ne +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #12] +; NONEON-NOSVE-NEXT: tst w15, #0xffff +; NONEON-NOSVE-NEXT: strh w13, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #10] +; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: csel w13, w16, w13, ne +; NONEON-NOSVE-NEXT: tst w14, #0xffff +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #8] +; NONEON-NOSVE-NEXT: strh w13, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #40] +; NONEON-NOSVE-NEXT: csel w13, w15, w13, ne +; NONEON-NOSVE-NEXT: tst w12, #0xffff +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: strh w13, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #24] +; NONEON-NOSVE-NEXT: csel w12, w14, w13, ne +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #6] +; NONEON-NOSVE-NEXT: tst w11, #0xffff +; NONEON-NOSVE-NEXT: strh w12, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #22] +; NONEON-NOSVE-NEXT: csel w11, w13, w12, ne +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #4] +; NONEON-NOSVE-NEXT: tst w10, #0xffff +; NONEON-NOSVE-NEXT: strh w11, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #20] +; NONEON-NOSVE-NEXT: csel w10, w12, w11, ne +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #2] +; NONEON-NOSVE-NEXT: tst w9, #0xffff +; NONEON-NOSVE-NEXT: strh w10, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #18] +; NONEON-NOSVE-NEXT: csel w9, w11, w10, ne +; NONEON-NOSVE-NEXT: ldrh w10, [sp] +; NONEON-NOSVE-NEXT: tst w8, #0xffff +; NONEON-NOSVE-NEXT: strh w9, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #16] +; NONEON-NOSVE-NEXT: csel w8, w10, w9, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, 
#64 ; NONEON-NOSVE-NEXT: ret %sel = select <8 x i1> %mask, <8 x i16> %op1, <8 x i16> %op2 ret <8 x i16> %sel @@ -204,14 +704,98 @@ define void @select_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: select_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] -; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] -; NONEON-NOSVE-NEXT: cmeq v4.8h, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: cmeq v5.8h, v2.8h, v3.8h -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v4.16b -; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b -; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #112 +; NONEON-NOSVE-NEXT: str x19, [sp, #96] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 112 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -16 +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #8] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, eq +; NONEON-NOSVE-NEXT: cmp w13, w12 +; NONEON-NOSVE-NEXT: ldrh w1, [sp, #12] +; NONEON-NOSVE-NEXT: csel w9, w13, w12, eq +; NONEON-NOSVE-NEXT: cmp w15, w14 +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #10] +; NONEON-NOSVE-NEXT: csel w14, w15, w14, eq +; NONEON-NOSVE-NEXT: cmp w17, w16 +; NONEON-NOSVE-NEXT: csel w16, w17, w16, eq +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #28] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp, #32] +; NONEON-NOSVE-NEXT: cmp w13, w12 +; NONEON-NOSVE-NEXT: ldrh w3, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w4, [sp, #14] +; NONEON-NOSVE-NEXT: csel w12, w13, w12, eq +; NONEON-NOSVE-NEXT: cmp w1, w17 +; NONEON-NOSVE-NEXT: ldrh w5, [sp, #32] +; NONEON-NOSVE-NEXT: csel w17, w1, w17, eq +; 
NONEON-NOSVE-NEXT: ldrh w1, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w4, w3 +; NONEON-NOSVE-NEXT: ldrh w6, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w7, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w2, [sp, #52] +; NONEON-NOSVE-NEXT: csel w3, w4, w3, eq +; NONEON-NOSVE-NEXT: ldrh w4, [sp, #36] +; NONEON-NOSVE-NEXT: cmp w5, w1 +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w19, [sp, #38] +; NONEON-NOSVE-NEXT: csel w1, w5, w1, eq +; NONEON-NOSVE-NEXT: cmp w7, w6 +; NONEON-NOSVE-NEXT: ldrh w18, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w5, [sp, #40] +; NONEON-NOSVE-NEXT: csel w6, w7, w6, eq +; NONEON-NOSVE-NEXT: cmp w4, w2 +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w7, [sp, #42] +; NONEON-NOSVE-NEXT: csel w2, w4, w2, eq +; NONEON-NOSVE-NEXT: cmp w19, w13 +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w4, [sp, #44] +; NONEON-NOSVE-NEXT: csel w13, w19, w13, eq +; NONEON-NOSVE-NEXT: cmp w5, w18 +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w19, [sp, #46] +; NONEON-NOSVE-NEXT: csel w18, w5, w18, eq +; NONEON-NOSVE-NEXT: cmp w7, w15 +; NONEON-NOSVE-NEXT: ldrh w5, [sp] +; NONEON-NOSVE-NEXT: csel w15, w7, w15, eq +; NONEON-NOSVE-NEXT: cmp w4, w11 +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: csel w11, w4, w11, eq +; NONEON-NOSVE-NEXT: ldrh w4, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w19, w10 +; NONEON-NOSVE-NEXT: csel w10, w19, w10, eq +; NONEON-NOSVE-NEXT: strh w11, [sp, #92] +; NONEON-NOSVE-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: cmp w5, w4 +; NONEON-NOSVE-NEXT: strh w10, [sp, #94] +; NONEON-NOSVE-NEXT: csel w8, w5, w4, eq +; NONEON-NOSVE-NEXT: strh w15, [sp, #90] +; NONEON-NOSVE-NEXT: strh w18, [sp, #88] +; NONEON-NOSVE-NEXT: strh w13, [sp, #86] +; NONEON-NOSVE-NEXT: strh w2, [sp, #84] +; NONEON-NOSVE-NEXT: strh w6, [sp, #82] +; NONEON-NOSVE-NEXT: strh w1, [sp, #80] +; NONEON-NOSVE-NEXT: strh w3, [sp, #78] +; NONEON-NOSVE-NEXT: strh w17, [sp, #76] +; 
NONEON-NOSVE-NEXT: strh w12, [sp, #74] +; NONEON-NOSVE-NEXT: strh w16, [sp, #72] +; NONEON-NOSVE-NEXT: strh w14, [sp, #70] +; NONEON-NOSVE-NEXT: strh w9, [sp, #68] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #112 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -238,9 +822,25 @@ define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, <2 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: select_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v2.2s, v2.2s, #31 -; NONEON-NOSVE-NEXT: cmlt v2.2s, v2.2s, #0 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #16] +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #12] +; NONEON-NOSVE-NEXT: ldr w11, [sp, #4] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: csel w8, w11, w10, ne +; NONEON-NOSVE-NEXT: ldr w10, [sp] +; NONEON-NOSVE-NEXT: cmp w9, #0 +; NONEON-NOSVE-NEXT: str w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: csel w8, w10, w8, ne +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select <2 x i1> %mask, <2 x i32> %op1, <2 x i32> %op2 ret <2 x i32> %sel @@ -264,10 +864,40 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, <4 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: select_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: shl v2.4s, v2.4s, #31 -; NONEON-NOSVE-NEXT: cmlt v2.4s, v2.4s, #0 -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: 
.cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: str d2, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w12, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w13, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #42] +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #40] +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: cmp w9, #0 +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: csel w9, w13, w12, ne +; NONEON-NOSVE-NEXT: ldr w12, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w11, #0 +; NONEON-NOSVE-NEXT: str w9, [sp, #60] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w11, [sp, #4] +; NONEON-NOSVE-NEXT: csel w9, w12, w9, ne +; NONEON-NOSVE-NEXT: cmp w10, #0 +; NONEON-NOSVE-NEXT: ldr w10, [sp] +; NONEON-NOSVE-NEXT: str w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #20] +; NONEON-NOSVE-NEXT: csel w9, w11, w9, ne +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: str w9, [sp, #52] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: csel w8, w10, w9, ne +; NONEON-NOSVE-NEXT: str w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %sel = select <4 x i1> %mask, <4 x i32> %op1, <4 x i32> %op2 ret <4 x i32> %sel @@ -288,14 +918,43 @@ define void @select_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: select_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] -; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] -; NONEON-NOSVE-NEXT: cmeq v4.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: cmeq v5.4s, v2.4s, v3.4s -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v4.16b -; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b -; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-96]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w10, w13, [sp, #4] +; NONEON-NOSVE-NEXT: ldp w12, w11, [sp, #24] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w14, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w10, w9 +; NONEON-NOSVE-NEXT: csel w9, w10, w9, eq +; NONEON-NOSVE-NEXT: cmp w13, w12 +; NONEON-NOSVE-NEXT: ldp w15, w16, [sp, #48] +; NONEON-NOSVE-NEXT: csel w12, w13, w12, eq +; NONEON-NOSVE-NEXT: cmp w14, w11 +; NONEON-NOSVE-NEXT: ldp w10, w13, [sp, #32] +; NONEON-NOSVE-NEXT: csel w11, w14, w11, eq +; NONEON-NOSVE-NEXT: ldp w17, w14, [sp, #56] +; NONEON-NOSVE-NEXT: ldp w18, w1, [sp, #40] +; NONEON-NOSVE-NEXT: cmp w10, w15 +; NONEON-NOSVE-NEXT: stp w12, w11, [sp, #72] +; NONEON-NOSVE-NEXT: csel w10, w10, w15, eq +; NONEON-NOSVE-NEXT: cmp w13, w16 +; NONEON-NOSVE-NEXT: ldr w15, [sp] +; NONEON-NOSVE-NEXT: csel w13, w13, w16, eq +; NONEON-NOSVE-NEXT: cmp w18, w17 +; NONEON-NOSVE-NEXT: csel w16, w18, w17, eq +; NONEON-NOSVE-NEXT: cmp w1, w14 +; NONEON-NOSVE-NEXT: stp w10, w13, [sp, #80] +; NONEON-NOSVE-NEXT: csel w10, w1, w14, eq +; NONEON-NOSVE-NEXT: cmp w15, w8 +; NONEON-NOSVE-NEXT: csel w8, w15, w8, eq +; NONEON-NOSVE-NEXT: stp w16, w10, [sp, #88] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -321,10 +980,15 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, <1 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: select_v1i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm x8, ne -; NONEON-NOSVE-NEXT: fmov d2, x8 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: csel 
x8, x9, x8, ne +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %sel = select <1 x i1> %mask, <1 x i64> %op1, <1 x i64> %op2 ret <1 x i64> %sel @@ -348,10 +1012,25 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: select_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: shl v2.2d, v2.2d, #63 -; NONEON-NOSVE-NEXT: cmlt v2.2d, v2.2d, #0 -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: str d2, [sp, #40] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #40] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #24] +; NONEON-NOSVE-NEXT: ldr x11, [sp, #8] +; NONEON-NOSVE-NEXT: sbfx x8, x8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx x9, x9, #0, #1 +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: csel x8, x11, x10, ne +; NONEON-NOSVE-NEXT: ldr x10, [sp] +; NONEON-NOSVE-NEXT: cmp x9, #0 +; NONEON-NOSVE-NEXT: str x8, [sp, #56] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: csel x8, x10, x8, ne +; NONEON-NOSVE-NEXT: str x8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %sel = select <2 x i1> %mask, <2 x i64> %op1, <2 x i64> %op2 ret <2 x i64> %sel @@ -372,14 +1051,30 @@ define void @select_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: select_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] -; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] -; NONEON-NOSVE-NEXT: cmeq v4.2d, v0.2d, v1.2d -; NONEON-NOSVE-NEXT: cmeq v5.2d, v2.2d, v3.2d -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v4.16b -; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b -; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: stp q1, 
q0, [sp, #-96]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: stp q2, q3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #8] +; NONEON-NOSVE-NEXT: ldp x8, x11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr x13, [sp, #40] +; NONEON-NOSVE-NEXT: ldp x10, x12, [sp, #48] +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, eq +; NONEON-NOSVE-NEXT: cmp x11, x10 +; NONEON-NOSVE-NEXT: csel x9, x11, x10, eq +; NONEON-NOSVE-NEXT: ldr x10, [sp, #16] +; NONEON-NOSVE-NEXT: ldr x11, [sp] +; NONEON-NOSVE-NEXT: cmp x13, x12 +; NONEON-NOSVE-NEXT: csel x12, x13, x12, eq +; NONEON-NOSVE-NEXT: cmp x11, x10 +; NONEON-NOSVE-NEXT: stp x9, x12, [sp, #80] +; NONEON-NOSVE-NEXT: csel x9, x11, x10, eq +; NONEON-NOSVE-NEXT: stp x9, x8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll index 0b6152340f65ab..66d544d0acbf56 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll @@ -33,19 +33,23 @@ define void @alloc_v4i8(ptr %st_ptr) nounwind { ; ; NONEON-NOSVE-LABEL: alloc_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub sp, sp, #32 -; NONEON-NOSVE-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: sub sp, sp, #48 +; NONEON-NOSVE-NEXT: stp x30, x19, [sp, #32] // 16-byte Folded Spill ; NONEON-NOSVE-NEXT: mov x19, x0 -; NONEON-NOSVE-NEXT: add x0, sp, #12 +; NONEON-NOSVE-NEXT: add x0, sp, #28 ; NONEON-NOSVE-NEXT: bl def -; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] -; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: umov w8, v0.h[2] -; NONEON-NOSVE-NEXT: umov w9, v0.h[0] +; NONEON-NOSVE-NEXT: ldrb w8, 
[sp, #30] +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: str d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #16] ; NONEON-NOSVE-NEXT: strb w8, [x19, #1] ; NONEON-NOSVE-NEXT: strb w9, [x19] -; NONEON-NOSVE-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %alloc = alloca [4 x i8] call void @def(ptr %alloc) @@ -88,21 +92,25 @@ define void @alloc_v6i8(ptr %st_ptr) nounwind { ; ; NONEON-NOSVE-LABEL: alloc_v6i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub sp, sp, #32 -; NONEON-NOSVE-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: sub sp, sp, #48 +; NONEON-NOSVE-NEXT: stp x30, x19, [sp, #32] // 16-byte Folded Spill ; NONEON-NOSVE-NEXT: mov x19, x0 -; NONEON-NOSVE-NEXT: add x0, sp, #8 +; NONEON-NOSVE-NEXT: add x0, sp, #24 ; NONEON-NOSVE-NEXT: bl def -; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] -; NONEON-NOSVE-NEXT: add x9, x19, #2 -; NONEON-NOSVE-NEXT: rev16 v1.16b, v0.16b -; NONEON-NOSVE-NEXT: xtn v1.8b, v1.8h -; NONEON-NOSVE-NEXT: str s1, [sp, #4] -; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] -; NONEON-NOSVE-NEXT: st1 { v0.b }[5], [x9] -; NONEON-NOSVE-NEXT: strh w8, [x19] -; NONEON-NOSVE-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: str x8, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #20] +; NONEON-NOSVE-NEXT: strb w8, 
[x19, #2] +; NONEON-NOSVE-NEXT: strh w9, [x19] +; NONEON-NOSVE-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %alloc = alloca [6 x i8] call void @def(ptr %alloc) @@ -135,18 +143,38 @@ define void @alloc_v32i8(ptr %st_ptr) nounwind { ; ; NONEON-NOSVE-LABEL: alloc_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub sp, sp, #48 -; NONEON-NOSVE-NEXT: stp x30, x19, [sp, #32] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: sub sp, sp, #112 +; NONEON-NOSVE-NEXT: stp x30, x19, [sp, #96] // 16-byte Folded Spill ; NONEON-NOSVE-NEXT: mov x19, x0 -; NONEON-NOSVE-NEXT: mov x0, sp +; NONEON-NOSVE-NEXT: add x0, sp, #64 ; NONEON-NOSVE-NEXT: bl def -; NONEON-NOSVE-NEXT: ldp q0, q1, [sp] -; NONEON-NOSVE-NEXT: add x8, x19, #8 -; NONEON-NOSVE-NEXT: xtn v0.8b, v0.8h -; NONEON-NOSVE-NEXT: st1 { v1.b }[0], [x8] -; NONEON-NOSVE-NEXT: str d0, [x19] -; NONEON-NOSVE-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [x19, #8] +; NONEON-NOSVE-NEXT: str q0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; 
NONEON-NOSVE-NEXT: str x8, [x19] +; NONEON-NOSVE-NEXT: ldp x30, x19, [sp, #96] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: add sp, sp, #112 ; NONEON-NOSVE-NEXT: ret %alloc = alloca [32 x i8] call void @def(ptr %alloc) @@ -179,18 +207,26 @@ define void @alloc_v8f64(ptr %st_ptr) nounwind { ; ; NONEON-NOSVE-LABEL: alloc_v8f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub sp, sp, #80 -; NONEON-NOSVE-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: sub sp, sp, #176 +; NONEON-NOSVE-NEXT: stp x30, x19, [sp, #160] // 16-byte Folded Spill ; NONEON-NOSVE-NEXT: mov x19, x0 -; NONEON-NOSVE-NEXT: mov x0, sp +; NONEON-NOSVE-NEXT: add x0, sp, #96 ; NONEON-NOSVE-NEXT: bl def -; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #32] -; NONEON-NOSVE-NEXT: ldp q3, q2, [sp] -; NONEON-NOSVE-NEXT: zip1 v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: zip1 v1.2d, v3.2d, v2.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q2, q3, [sp, #128] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #48] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: ldr d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #80] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: stp q1, q0, [x19] -; NONEON-NOSVE-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: add sp, sp, #80 +; NONEON-NOSVE-NEXT: ldp x30, x19, [sp, #160] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: add sp, sp, #176 ; NONEON-NOSVE-NEXT: ret %alloc = alloca [8 x double] call void @def(ptr %alloc) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll index 42c439ca4b38d4..3b83f982b6bfc5 100644 --- 
a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll @@ -22,15 +22,68 @@ define <4 x i32> @test(ptr %arg1, ptr %arg2) { ; ; NONEON-NOSVE-LABEL: test: ; NONEON-NOSVE: // %bb.0: // %entry -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q3, q4, [x0] -; NONEON-NOSVE-NEXT: add v2.4s, v0.4s, v0.4s -; NONEON-NOSVE-NEXT: add v5.4s, v1.4s, v1.4s -; NONEON-NOSVE-NEXT: dup v0.4s, v1.s[2] -; NONEON-NOSVE-NEXT: add v1.4s, v3.4s, v3.4s -; NONEON-NOSVE-NEXT: add v3.4s, v4.4s, v4.4s -; NONEON-NOSVE-NEXT: stp q2, q5, [x0, #32] -; NONEON-NOSVE-NEXT: stp q1, q3, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #144 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 144 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: str q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #60] +; NONEON-NOSVE-NEXT: str q2, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #40] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: stp w8, w8, [sp, #8] +; NONEON-NOSVE-NEXT: str w9, [sp, #124] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #56] +; NONEON-NOSVE-NEXT: stp w8, w8, [sp] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: ldr q0, [sp] +; NONEON-NOSVE-NEXT: str w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #52] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: str w9, [sp, #116] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #48] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: str w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #28] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: str w9, [sp, #92] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #24] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: str w9, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #20] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: str w9, [sp, #84] +; 
NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: str w9, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #44] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #36] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #100] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #96] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldp q4, q2, [sp, #80] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #140] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #72] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #136] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #68] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #132] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #64] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q3, q1, [sp, #112] +; NONEON-NOSVE-NEXT: stp q4, q3, [x0] +; NONEON-NOSVE-NEXT: stp q1, q2, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #144 ; NONEON-NOSVE-NEXT: ret entry: %0 = load <16 x i32>, ptr %arg1, align 256 @@ -59,15 +112,71 @@ define <2 x i32> @test2(ptr %arg1, ptr %arg2) { ; ; NONEON-NOSVE-LABEL: test2: ; NONEON-NOSVE: // %bb.0: // %entry -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q3, q4, [x0] -; NONEON-NOSVE-NEXT: add v2.4s, v0.4s, v0.4s -; NONEON-NOSVE-NEXT: dup v0.2s, v1.s[2] -; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v1.4s -; NONEON-NOSVE-NEXT: add v3.4s, v3.4s, v3.4s -; NONEON-NOSVE-NEXT: add v4.4s, v4.4s, v4.4s -; NONEON-NOSVE-NEXT: stp q2, q1, [x0, #32] -; NONEON-NOSVE-NEXT: stp q3, q4, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #144 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 144 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: str q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q3, 
[sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: str q2, [sp, #16] +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: stp w8, w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #124] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #120] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #116] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #112] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #44] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #108] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #40] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #104] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #36] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #100] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #96] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldp q4, q2, [sp, #80] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #140] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #72] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #136] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #68] +; NONEON-NOSVE-NEXT: add 
w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #132] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #64] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q3, q1, [sp, #112] +; NONEON-NOSVE-NEXT: stp q4, q3, [x0] +; NONEON-NOSVE-NEXT: stp q1, q2, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #144 ; NONEON-NOSVE-NEXT: ret entry: %0 = load <16 x i32>, ptr %arg1, align 256 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll index 992b667a2eafe1..c97a3c2e721a3d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll @@ -15,9 +15,18 @@ define <4 x i8> @load_v4i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: load_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr s0, [x0] -; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w8, [x0, #3] +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w8, [x0, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w8, [x0, #1] +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w8, [x0] +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %load = load <4 x i8>, ptr %a ret <4 x i8> %load @@ -75,11 +84,14 @@ define <2 x i16> @load_v2i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: load_v2i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [x0, #2] +; NONEON-NOSVE-NEXT: str w8, [sp, #12] ; NONEON-NOSVE-NEXT: ldrh w8, [x0] -; NONEON-NOSVE-NEXT: fmov s0, w8 -; NONEON-NOSVE-NEXT: add x8, x0, #2 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x8] -; 
NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %load = load <2 x i16>, ptr %a ret <2 x i16> %load @@ -93,7 +105,12 @@ define <2 x half> @load_v2f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: load_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr s0, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr w8, [x0] +; NONEON-NOSVE-NEXT: str w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %load = load <2 x half>, ptr %a ret <2 x half> %load diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll index 7abe73f08dfd65..9e1edb817c459a 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll @@ -21,10 +21,17 @@ define i8 @andv_v4i8(<4 x i8> %a) { ; ; NONEON-NOSVE-LABEL: andv_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: lsr x9, x8, #16 -; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #8] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: and w10, w11, w10 +; NONEON-NOSVE-NEXT: and w0, w10, w8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %a) ret i8 %res @@ -41,11 +48,25 @@ define i8 @andv_v8i8(<8 x i8> %a) { ; ; NONEON-NOSVE-LABEL: andv_v8i8: ; NONEON-NOSVE: // 
%bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #16 -; NONEON-NOSVE-NEXT: lsr x9, x8, #8 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #14] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: and w12, w13, w12 +; NONEON-NOSVE-NEXT: and w10, w11, w10 +; NONEON-NOSVE-NEXT: and w10, w12, w10 +; NONEON-NOSVE-NEXT: and w8, w8, w14 +; NONEON-NOSVE-NEXT: and w8, w10, w8 ; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %a) ret i8 %res @@ -64,13 +85,37 @@ define i8 @andv_v16i8(<16 x i8> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #16 -; NONEON-NOSVE-NEXT: lsr x9, x8, #8 -; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w11, [sp] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #4] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #3] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #2] +; NONEON-NOSVE-NEXT: and w10, w11, w10 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #8] +; NONEON-NOSVE-NEXT: and w11, w14, w13 +; NONEON-NOSVE-NEXT: and w9, w12, w9 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #6] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #7] +; NONEON-NOSVE-NEXT: and w10, w10, w11 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #13] +; NONEON-NOSVE-NEXT: and w9, w9, w16 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #9] +; NONEON-NOSVE-NEXT: and w12, w12, w15 +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #14] +; NONEON-NOSVE-NEXT: and w8, w13, w8 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #15] +; NONEON-NOSVE-NEXT: and w12, w12, w14 +; NONEON-NOSVE-NEXT: and w8, w8, w11 +; NONEON-NOSVE-NEXT: and w9, w10, w9 +; NONEON-NOSVE-NEXT: and w10, w12, w16 +; NONEON-NOSVE-NEXT: and w8, w8, w15 +; NONEON-NOSVE-NEXT: and w9, w9, w10 +; NONEON-NOSVE-NEXT: and w8, w8, w13 +; NONEON-NOSVE-NEXT: and w0, w9, w8 ; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %a) @@ -90,17 +135,72 @@ define i8 @andv_v32i8(ptr %a) { ; NONEON-NOSVE-LABEL: andv_v32i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #16 -; NONEON-NOSVE-NEXT: lsr x9, x8, #8 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w11, [sp] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #2] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #3] +; NONEON-NOSVE-NEXT: and w9, w11, w10 +; NONEON-NOSVE-NEXT: and w10, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #5] +; NONEON-NOSVE-NEXT: and w11, w15, w14 +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #4] +; NONEON-NOSVE-NEXT: and w9, w10, w11 +; NONEON-NOSVE-NEXT: and w10, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #6] +; NONEON-NOSVE-NEXT: and w14, w17, w16 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #8] +; NONEON-NOSVE-NEXT: and w10, w14, w10 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #7] +; NONEON-NOSVE-NEXT: and w11, w12, w11 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #24] +; NONEON-NOSVE-NEXT: and w8, w8, w9 +; NONEON-NOSVE-NEXT: and w9, w10, w11 +; NONEON-NOSVE-NEXT: and w10, w14, w13 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #9] +; NONEON-NOSVE-NEXT: and w8, w8, w9 +; NONEON-NOSVE-NEXT: and w11, w15, w12 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #26] +; NONEON-NOSVE-NEXT: and w9, w10, w11 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #10] +; NONEON-NOSVE-NEXT: and w10, w13, w12 
+; NONEON-NOSVE-NEXT: ldrb w12, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #12] +; NONEON-NOSVE-NEXT: and w9, w9, w10 +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #14] +; NONEON-NOSVE-NEXT: and w11, w12, w11 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #15] +; NONEON-NOSVE-NEXT: and w10, w13, w10 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #13] +; NONEON-NOSVE-NEXT: and w14, w15, w14 +; NONEON-NOSVE-NEXT: and w10, w11, w10 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #30] +; NONEON-NOSVE-NEXT: and w9, w9, w14 +; NONEON-NOSVE-NEXT: and w12, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #31] +; NONEON-NOSVE-NEXT: and w8, w8, w9 +; NONEON-NOSVE-NEXT: and w10, w10, w12 +; NONEON-NOSVE-NEXT: and w11, w16, w11 +; NONEON-NOSVE-NEXT: and w10, w10, w11 +; NONEON-NOSVE-NEXT: and w11, w17, w13 +; NONEON-NOSVE-NEXT: and w9, w10, w11 ; NONEON-NOSVE-NEXT: and w0, w8, w9 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %op) @@ -118,9 +218,12 @@ define i16 @andv_v2i16(<2 x i16> %a) { ; ; NONEON-NOSVE-LABEL: andv_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: lsr x9, x8, #32 -; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: and w0, w9, w8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> %a) ret i16 %res @@ -137,10 +240,17 @@ define i16 @andv_v4i16(<4 x i16> %a) { ; ; NONEON-NOSVE-LABEL: andv_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: lsr x9, x8, #16 -; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: sub sp, sp, 
#16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #8] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: and w10, w11, w10 +; NONEON-NOSVE-NEXT: and w0, w10, w8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %a) ret i16 %res @@ -159,11 +269,20 @@ define i16 @andv_v8i16(<8 x i16> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w13, [sp] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: and w12, w13, w12 +; NONEON-NOSVE-NEXT: and w10, w11, w10 +; NONEON-NOSVE-NEXT: and w10, w12, w10 +; NONEON-NOSVE-NEXT: and w8, w8, w14 +; NONEON-NOSVE-NEXT: and w8, w10, w8 ; NONEON-NOSVE-NEXT: and w0, w8, w9 ; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret @@ -184,16 +303,40 @@ define i16 @andv_v16i16(ptr %a) { ; NONEON-NOSVE-LABEL: andv_v16i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w11, [sp] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #4] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #6] +; NONEON-NOSVE-NEXT: and w9, w11, w10 +; NONEON-NOSVE-NEXT: and w12, w13, w12 +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #10] +; NONEON-NOSVE-NEXT: and w13, w15, w14 +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #8] +; NONEON-NOSVE-NEXT: and w9, w12, w13 +; NONEON-NOSVE-NEXT: and w10, w11, w10 +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #12] +; NONEON-NOSVE-NEXT: and w14, w17, w16 +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #14] +; NONEON-NOSVE-NEXT: and w10, w14, w10 +; NONEON-NOSVE-NEXT: and w11, w15, w11 +; NONEON-NOSVE-NEXT: and w8, w8, w9 +; NONEON-NOSVE-NEXT: and w9, w10, w11 +; NONEON-NOSVE-NEXT: and w8, w8, w9 +; NONEON-NOSVE-NEXT: and w9, w13, w12 ; NONEON-NOSVE-NEXT: and w0, w8, w9 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %op) @@ -211,9 +354,12 @@ define i32 @andv_v2i32(<2 x i32> %a) { ; ; NONEON-NOSVE-LABEL: andv_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: lsr x9, x8, #32 -; NONEON-NOSVE-NEXT: and w0, 
w8, w9 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: and w0, w9, w8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %a) ret i32 %res @@ -232,12 +378,11 @@ define i32 @andv_v4i32(<4 x i32> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: lsr x9, x8, #32 -; NONEON-NOSVE-NEXT: and w0, w8, w9 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w11, w10, [sp], #16 +; NONEON-NOSVE-NEXT: and w10, w11, w10 +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: and w0, w10, w8 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a) ret i32 %res @@ -256,15 +401,20 @@ define i32 @andv_v8i32(ptr %a) { ; NONEON-NOSVE-LABEL: andv_v8i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: lsr x9, x8, #32 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w11, w10, [sp] +; NONEON-NOSVE-NEXT: ldp w12, w13, [sp, #24] +; NONEON-NOSVE-NEXT: ldp w14, w15, [sp, #8] +; NONEON-NOSVE-NEXT: and w8, w10, w8 +; NONEON-NOSVE-NEXT: and w9, w11, w9 +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: and w10, w14, w12 +; NONEON-NOSVE-NEXT: and w11, w15, w13 +; NONEON-NOSVE-NEXT: and w9, w10, w11 ; NONEON-NOSVE-NEXT: and w0, w8, w9 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %op) @@ -284,10 +434,8 @@ define i64 @andv_v2i64(<2 x i64> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x0, d0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp], #16 +; NONEON-NOSVE-NEXT: and x0, x9, x8 ; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %a) ret i64 %res @@ -306,13 +454,13 @@ define i64 @andv_v4i64(ptr %a) { ; NONEON-NOSVE-LABEL: andv_v4i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x0, d0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp, #16] +; NONEON-NOSVE-NEXT: ldp x11, x10, [sp], #32 +; NONEON-NOSVE-NEXT: and x8, x10, x8 +; NONEON-NOSVE-NEXT: and x9, x11, x9 +; NONEON-NOSVE-NEXT: and x0, x9, x8 ; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %op) @@ -334,10 +482,17 @@ define i8 @eorv_v4i8(<4 x i8> %a) { ; ; NONEON-NOSVE-LABEL: eorv_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: lsr x9, x8, #16 -; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #8] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: eor w10, w11, w10 +; NONEON-NOSVE-NEXT: eor w0, w10, w8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> %a) ret i8 %res @@ -354,11 +509,25 @@ define i8 @eorv_v8i8(<8 x i8> %a) { ; ; NONEON-NOSVE-LABEL: eorv_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #16 -; NONEON-NOSVE-NEXT: lsr x9, x8, #8 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #14] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: eor w12, w13, w12 +; 
NONEON-NOSVE-NEXT: eor w10, w11, w10 +; NONEON-NOSVE-NEXT: eor w10, w12, w10 +; NONEON-NOSVE-NEXT: eor w8, w8, w14 +; NONEON-NOSVE-NEXT: eor w8, w10, w8 ; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %a) ret i8 %res @@ -377,13 +546,37 @@ define i8 @eorv_v16i8(<16 x i8> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #16 -; NONEON-NOSVE-NEXT: lsr x9, x8, #8 -; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w11, [sp] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #4] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #3] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #2] +; NONEON-NOSVE-NEXT: eor w10, w11, w10 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #8] +; NONEON-NOSVE-NEXT: eor w11, w14, w13 +; NONEON-NOSVE-NEXT: eor w9, w12, w9 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #6] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #7] +; NONEON-NOSVE-NEXT: eor w10, w10, w11 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #13] +; NONEON-NOSVE-NEXT: eor w9, w9, w16 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #9] +; NONEON-NOSVE-NEXT: eor w12, w12, w15 +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #14] +; NONEON-NOSVE-NEXT: eor w8, w13, w8 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #15] +; NONEON-NOSVE-NEXT: eor w12, w12, w14 +; NONEON-NOSVE-NEXT: eor w8, w8, w11 +; NONEON-NOSVE-NEXT: eor w9, w10, w9 +; NONEON-NOSVE-NEXT: eor w10, w12, w16 +; NONEON-NOSVE-NEXT: eor w8, w8, w15 +; NONEON-NOSVE-NEXT: eor w9, w9, w10 +; NONEON-NOSVE-NEXT: eor w8, w8, w13 +; NONEON-NOSVE-NEXT: 
eor w0, w9, w8 ; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %a) @@ -403,17 +596,72 @@ define i8 @eorv_v32i8(ptr %a) { ; NONEON-NOSVE-LABEL: eorv_v32i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #16 -; NONEON-NOSVE-NEXT: lsr x9, x8, #8 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w11, [sp] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #2] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #3] +; NONEON-NOSVE-NEXT: eor w9, w11, w10 +; NONEON-NOSVE-NEXT: eor w10, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #5] +; NONEON-NOSVE-NEXT: eor w11, w15, w14 +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #4] +; NONEON-NOSVE-NEXT: eor w9, w10, w11 +; NONEON-NOSVE-NEXT: eor w10, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #6] +; NONEON-NOSVE-NEXT: eor w14, w17, w16 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #8] +; NONEON-NOSVE-NEXT: eor w10, w14, w10 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #7] +; NONEON-NOSVE-NEXT: eor w11, w12, w11 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #24] +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: eor w9, w10, w11 +; NONEON-NOSVE-NEXT: eor 
w10, w14, w13 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #9] +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: eor w11, w15, w12 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #26] +; NONEON-NOSVE-NEXT: eor w9, w10, w11 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #10] +; NONEON-NOSVE-NEXT: eor w10, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #12] +; NONEON-NOSVE-NEXT: eor w9, w9, w10 +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #14] +; NONEON-NOSVE-NEXT: eor w11, w12, w11 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #15] +; NONEON-NOSVE-NEXT: eor w10, w13, w10 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #13] +; NONEON-NOSVE-NEXT: eor w14, w15, w14 +; NONEON-NOSVE-NEXT: eor w10, w11, w10 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #30] +; NONEON-NOSVE-NEXT: eor w9, w9, w14 +; NONEON-NOSVE-NEXT: eor w12, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #31] +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: eor w10, w10, w12 +; NONEON-NOSVE-NEXT: eor w11, w16, w11 +; NONEON-NOSVE-NEXT: eor w10, w10, w11 +; NONEON-NOSVE-NEXT: eor w11, w17, w13 +; NONEON-NOSVE-NEXT: eor w9, w10, w11 ; NONEON-NOSVE-NEXT: eor w0, w8, w9 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %op) @@ -431,9 +679,12 @@ define i16 @eorv_v2i16(<2 x i16> %a) { ; ; NONEON-NOSVE-LABEL: eorv_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: lsr x9, x8, #32 -; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: eor w0, w9, w8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = 
call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> %a) ret i16 %res @@ -450,10 +701,17 @@ define i16 @eorv_v4i16(<4 x i16> %a) { ; ; NONEON-NOSVE-LABEL: eorv_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: lsr x9, x8, #16 -; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #8] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: eor w10, w11, w10 +; NONEON-NOSVE-NEXT: eor w0, w10, w8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %a) ret i16 %res @@ -472,11 +730,20 @@ define i16 @eorv_v8i16(<8 x i16> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w13, [sp] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: eor w12, w13, w12 +; NONEON-NOSVE-NEXT: eor w10, w11, w10 +; NONEON-NOSVE-NEXT: eor w10, w12, w10 +; NONEON-NOSVE-NEXT: eor w8, w8, w14 +; NONEON-NOSVE-NEXT: eor w8, w10, w8 ; NONEON-NOSVE-NEXT: eor w0, w8, w9 ; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret @@ -497,16 +764,40 @@ define i16 @eorv_v16i16(ptr %a) { ; NONEON-NOSVE-LABEL: eorv_v16i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w11, [sp] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #4] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #6] +; NONEON-NOSVE-NEXT: eor w9, w11, w10 +; NONEON-NOSVE-NEXT: eor w12, w13, w12 +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #10] +; NONEON-NOSVE-NEXT: eor w13, w15, w14 +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #8] +; NONEON-NOSVE-NEXT: eor w9, w12, w13 +; NONEON-NOSVE-NEXT: eor w10, w11, w10 +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #12] +; NONEON-NOSVE-NEXT: eor w14, w17, w16 +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #14] +; NONEON-NOSVE-NEXT: eor w10, w14, w10 +; NONEON-NOSVE-NEXT: eor w11, w15, w11 +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: eor w9, w10, w11 +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: eor w9, w13, w12 ; NONEON-NOSVE-NEXT: eor w0, w8, w9 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %op) @@ -524,9 +815,12 @@ define i32 @eorv_v2i32(<2 x i32> %a) { ; ; NONEON-NOSVE-LABEL: eorv_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: lsr x9, x8, #32 -; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: eor w0, w9, w8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i32 
@llvm.vector.reduce.xor.v2i32(<2 x i32> %a) ret i32 %res @@ -545,12 +839,11 @@ define i32 @eorv_v4i32(<4 x i32> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: lsr x9, x8, #32 -; NONEON-NOSVE-NEXT: eor w0, w8, w9 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w11, w10, [sp], #16 +; NONEON-NOSVE-NEXT: eor w10, w11, w10 +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: eor w0, w10, w8 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a) ret i32 %res @@ -569,15 +862,20 @@ define i32 @eorv_v8i32(ptr %a) { ; NONEON-NOSVE-LABEL: eorv_v8i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: lsr x9, x8, #32 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w11, w10, [sp] +; NONEON-NOSVE-NEXT: ldp w12, w13, [sp, #24] +; NONEON-NOSVE-NEXT: ldp w14, w15, [sp, #8] +; NONEON-NOSVE-NEXT: eor w8, w10, w8 +; NONEON-NOSVE-NEXT: eor w9, w11, w9 +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: eor w10, w14, w12 +; NONEON-NOSVE-NEXT: eor w11, w15, w13 +; NONEON-NOSVE-NEXT: eor w9, w10, w11 ; NONEON-NOSVE-NEXT: eor w0, w8, w9 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %op) @@ -597,10 +895,8 @@ define i64 @eorv_v2i64(<2 x i64> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x0, d0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp], #16 +; NONEON-NOSVE-NEXT: eor x0, x9, x8 ; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %a) ret i64 %res @@ -619,13 +915,13 @@ define i64 @eorv_v4i64(ptr %a) { ; NONEON-NOSVE-LABEL: eorv_v4i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x0, d0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp, #16] +; NONEON-NOSVE-NEXT: ldp x11, x10, [sp], #32 +; NONEON-NOSVE-NEXT: eor x8, x10, x8 +; NONEON-NOSVE-NEXT: eor x9, x11, x9 +; NONEON-NOSVE-NEXT: eor x0, x9, x8 ; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %op) @@ -647,10 +943,17 @@ define i8 @orv_v4i8(<4 x i8> %a) { ; ; NONEON-NOSVE-LABEL: orv_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: lsr x9, x8, #16 -; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #8] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: orr w10, w11, w10 +; NONEON-NOSVE-NEXT: orr w0, w10, w8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %a) ret i8 %res @@ -667,11 +970,25 @@ define i8 @orv_v8i8(<8 x i8> %a) { ; ; NONEON-NOSVE-LABEL: orv_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #16 -; NONEON-NOSVE-NEXT: lsr x9, x8, #8 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #14] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: orr w12, w13, w12 +; NONEON-NOSVE-NEXT: 
orr w10, w11, w10 +; NONEON-NOSVE-NEXT: orr w10, w12, w10 +; NONEON-NOSVE-NEXT: orr w8, w8, w14 +; NONEON-NOSVE-NEXT: orr w8, w10, w8 ; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %a) ret i8 %res @@ -690,13 +1007,37 @@ define i8 @orv_v16i8(<16 x i8> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #16 -; NONEON-NOSVE-NEXT: lsr x9, x8, #8 -; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w11, [sp] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #4] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #3] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #2] +; NONEON-NOSVE-NEXT: orr w10, w11, w10 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #8] +; NONEON-NOSVE-NEXT: orr w11, w14, w13 +; NONEON-NOSVE-NEXT: orr w9, w12, w9 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #6] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #7] +; NONEON-NOSVE-NEXT: orr w10, w10, w11 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #13] +; NONEON-NOSVE-NEXT: orr w9, w9, w16 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #9] +; NONEON-NOSVE-NEXT: orr w12, w12, w15 +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #14] +; NONEON-NOSVE-NEXT: orr w8, w13, w8 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #15] +; NONEON-NOSVE-NEXT: orr w12, w12, w14 +; NONEON-NOSVE-NEXT: orr w8, w8, w11 +; NONEON-NOSVE-NEXT: orr w9, w10, w9 +; NONEON-NOSVE-NEXT: orr w10, w12, w16 +; NONEON-NOSVE-NEXT: orr w8, w8, w15 +; NONEON-NOSVE-NEXT: orr w9, w9, w10 +; NONEON-NOSVE-NEXT: orr w8, w8, w13 +; NONEON-NOSVE-NEXT: orr w0, w9, w8 ; 
NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %a) @@ -716,17 +1057,72 @@ define i8 @orv_v32i8(ptr %a) { ; NONEON-NOSVE-LABEL: orv_v32i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #16 -; NONEON-NOSVE-NEXT: lsr x9, x8, #8 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w11, [sp] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #2] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #3] +; NONEON-NOSVE-NEXT: orr w9, w11, w10 +; NONEON-NOSVE-NEXT: orr w10, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #5] +; NONEON-NOSVE-NEXT: orr w11, w15, w14 +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #4] +; NONEON-NOSVE-NEXT: orr w9, w10, w11 +; NONEON-NOSVE-NEXT: orr w10, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #6] +; NONEON-NOSVE-NEXT: orr w14, w17, w16 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #8] +; NONEON-NOSVE-NEXT: orr w10, w14, w10 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #7] +; NONEON-NOSVE-NEXT: orr w11, w12, w11 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #24] +; NONEON-NOSVE-NEXT: orr w8, w8, w9 +; NONEON-NOSVE-NEXT: orr w9, w10, w11 +; NONEON-NOSVE-NEXT: orr w10, w14, w13 +; 
NONEON-NOSVE-NEXT: ldrb w13, [sp, #9] +; NONEON-NOSVE-NEXT: orr w8, w8, w9 +; NONEON-NOSVE-NEXT: orr w11, w15, w12 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #26] +; NONEON-NOSVE-NEXT: orr w9, w10, w11 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #10] +; NONEON-NOSVE-NEXT: orr w10, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #12] +; NONEON-NOSVE-NEXT: orr w9, w9, w10 +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #14] +; NONEON-NOSVE-NEXT: orr w11, w12, w11 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #15] +; NONEON-NOSVE-NEXT: orr w10, w13, w10 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #13] +; NONEON-NOSVE-NEXT: orr w14, w15, w14 +; NONEON-NOSVE-NEXT: orr w10, w11, w10 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #30] +; NONEON-NOSVE-NEXT: orr w9, w9, w14 +; NONEON-NOSVE-NEXT: orr w12, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #31] +; NONEON-NOSVE-NEXT: orr w8, w8, w9 +; NONEON-NOSVE-NEXT: orr w10, w10, w12 +; NONEON-NOSVE-NEXT: orr w11, w16, w11 +; NONEON-NOSVE-NEXT: orr w10, w10, w11 +; NONEON-NOSVE-NEXT: orr w11, w17, w13 +; NONEON-NOSVE-NEXT: orr w9, w10, w11 ; NONEON-NOSVE-NEXT: orr w0, w8, w9 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %op) @@ -744,9 +1140,12 @@ define i16 @orv_v2i16(<2 x i16> %a) { ; ; NONEON-NOSVE-LABEL: orv_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: lsr x9, x8, #32 -; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: orr w0, w9, w8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 
@llvm.vector.reduce.or.v2i16(<2 x i16> %a) ret i16 %res @@ -763,10 +1162,17 @@ define i16 @orv_v4i16(<4 x i16> %a) { ; ; NONEON-NOSVE-LABEL: orv_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: lsr x9, x8, #16 -; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #8] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: orr w10, w11, w10 +; NONEON-NOSVE-NEXT: orr w0, w10, w8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %a) ret i16 %res @@ -785,11 +1191,20 @@ define i16 @orv_v8i16(<8 x i16> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w13, [sp] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: orr w12, w13, w12 +; NONEON-NOSVE-NEXT: orr w10, w11, w10 +; NONEON-NOSVE-NEXT: orr w10, w12, w10 +; NONEON-NOSVE-NEXT: orr w8, w8, w14 +; NONEON-NOSVE-NEXT: orr w8, w10, w8 ; NONEON-NOSVE-NEXT: orr w0, w8, w9 ; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret @@ -810,16 +1225,40 @@ define i16 @orv_v16i16(ptr %a) { ; NONEON-NOSVE-LABEL: orv_v16i16: ; 
NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w11, [sp] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #4] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #6] +; NONEON-NOSVE-NEXT: orr w9, w11, w10 +; NONEON-NOSVE-NEXT: orr w12, w13, w12 +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #10] +; NONEON-NOSVE-NEXT: orr w13, w15, w14 +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #8] +; NONEON-NOSVE-NEXT: orr w9, w12, w13 +; NONEON-NOSVE-NEXT: orr w10, w11, w10 +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #12] +; NONEON-NOSVE-NEXT: orr w14, w17, w16 +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #14] +; NONEON-NOSVE-NEXT: orr w10, w14, w10 +; NONEON-NOSVE-NEXT: orr w11, w15, w11 +; NONEON-NOSVE-NEXT: orr w8, w8, w9 +; NONEON-NOSVE-NEXT: orr w9, w10, w11 +; NONEON-NOSVE-NEXT: orr w8, w8, w9 +; NONEON-NOSVE-NEXT: orr w9, w13, w12 ; NONEON-NOSVE-NEXT: orr w0, w8, w9 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %op) @@ -837,9 +1276,12 @@ define i32 @orv_v2i32(<2 x i32> %a) { ; ; 
NONEON-NOSVE-LABEL: orv_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: lsr x9, x8, #32 -; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: orr w0, w9, w8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %a) ret i32 %res @@ -858,12 +1300,11 @@ define i32 @orv_v4i32(<4 x i32> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: lsr x9, x8, #32 -; NONEON-NOSVE-NEXT: orr w0, w8, w9 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w11, w10, [sp], #16 +; NONEON-NOSVE-NEXT: orr w10, w11, w10 +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: orr w0, w10, w8 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a) ret i32 %res @@ -882,15 +1323,20 @@ define i32 @orv_v8i32(ptr %a) { ; NONEON-NOSVE-LABEL: orv_v8i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: lsr x9, x8, #32 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w11, w10, [sp] +; NONEON-NOSVE-NEXT: ldp w12, w13, [sp, #24] +; NONEON-NOSVE-NEXT: ldp w14, w15, [sp, #8] +; NONEON-NOSVE-NEXT: orr w8, w10, w8 +; NONEON-NOSVE-NEXT: orr w9, w11, w9 +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: orr w10, w14, w12 +; NONEON-NOSVE-NEXT: orr w11, w15, w13 +; NONEON-NOSVE-NEXT: orr w9, w10, w11 ; NONEON-NOSVE-NEXT: orr w0, w8, w9 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %op) @@ -910,10 +1356,8 @@ define i64 @orv_v2i64(<2 x i64> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x0, d0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp], #16 +; NONEON-NOSVE-NEXT: orr x0, x9, x8 ; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a) ret i64 %res @@ -932,13 +1376,13 @@ define i64 @orv_v4i64(ptr %a) { ; NONEON-NOSVE-LABEL: orv_v4i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x0, d0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp, #16] +; NONEON-NOSVE-NEXT: ldp x11, x10, [sp], #32 +; NONEON-NOSVE-NEXT: orr x8, x10, x8 +; NONEON-NOSVE-NEXT: orr x9, x11, x9 +; NONEON-NOSVE-NEXT: orr x0, x9, x8 ; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %op) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll index 6c33613f8e757d..be335c697707de 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll @@ -23,40 +23,83 @@ define <4 x i8> @masked_load_v4i8(ptr %src, <4 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_load_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI0_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI0_0] -; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addv h0, v0.4h -; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #128 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: str d0, [sp, #112] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #114] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #116] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #118] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #112] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: and w9, w9, #0x4 +; NONEON-NOSVE-NEXT: and w10, w10, #0x8 +; NONEON-NOSVE-NEXT: bfxil w8, w11, #0, #1 +; NONEON-NOSVE-NEXT: orr w9, w9, w10 +; NONEON-NOSVE-NEXT: orr w8, w8, w9 ; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB0_2 ; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 -; NONEON-NOSVE-NEXT: ld1 { 
v0.b }[0], [x0] +; NONEON-NOSVE-NEXT: ldrb w9, [x0] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #110] +; NONEON-NOSVE-NEXT: stur wzr, [sp, #106] +; NONEON-NOSVE-NEXT: strh w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #104] ; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB0_3 ; NONEON-NOSVE-NEXT: b .LBB0_4 ; NONEON-NOSVE-NEXT: .LBB0_2: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x9, .LCPI0_0 +; NONEON-NOSVE-NEXT: ldr d0, [x9, :lo12:.LCPI0_0] ; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB0_4 ; NONEON-NOSVE-NEXT: .LBB0_3: // %cond.load1 -; NONEON-NOSVE-NEXT: add x9, x0, #1 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[2], [x9] +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #1] +; NONEON-NOSVE-NEXT: str d0, [sp, #80] +; NONEON-NOSVE-NEXT: strh w9, [sp, #96] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #84] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #96] +; NONEON-NOSVE-NEXT: str w9, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #80] +; NONEON-NOSVE-NEXT: str d0, [sp, #72] +; NONEON-NOSVE-NEXT: strh w9, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #72] +; NONEON-NOSVE-NEXT: strh w9, [sp, #90] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #88] ; NONEON-NOSVE-NEXT: .LBB0_4: // %else2 ; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB0_7 ; NONEON-NOSVE-NEXT: // %bb.5: // %else5 ; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB0_8 ; NONEON-NOSVE-NEXT: .LBB0_6: // %else8 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret ; NONEON-NOSVE-NEXT: .LBB0_7: // %cond.load4 -; NONEON-NOSVE-NEXT: add x9, x0, #2 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[4], [x9] +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #2] +; NONEON-NOSVE-NEXT: str d0, [sp, #48] +; NONEON-NOSVE-NEXT: strh w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #54] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #64] +; NONEON-NOSVE-NEXT: strh w9, [sp, #62] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #48] +; NONEON-NOSVE-NEXT: str d0, [sp, #40] +; NONEON-NOSVE-NEXT: str w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; 
NONEON-NOSVE-NEXT: strh w9, [sp, #60] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] ; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB0_6 ; NONEON-NOSVE-NEXT: .LBB0_8: // %cond.load7 -; NONEON-NOSVE-NEXT: add x8, x0, #3 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[6], [x8] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ldrb w8, [x0, #3] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: str d0, [sp, #16] +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %load = call <4 x i8> @llvm.masked.load.v4i8(ptr %src, i32 8, <4 x i1> %mask, <4 x i8> zeroinitializer) ret <4 x i8> %load @@ -76,64 +119,183 @@ define <8 x i8> @masked_load_v8i8(ptr %src, <8 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_load_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.8b, v0.8b, #7 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI1_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI1_0] -; NONEON-NOSVE-NEXT: cmlt v0.8b, v0.8b, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addv b0, v0.8b -; NONEON-NOSVE-NEXT: fmov w8, s0 -; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB1_2 +; NONEON-NOSVE-NEXT: sub sp, sp, #272 +; NONEON-NOSVE-NEXT: str x29, [sp, #256] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 272 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: str d0, [sp, #240] +; NONEON-NOSVE-NEXT: add x9, sp, #176 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #242] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #243] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #241] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #244] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #245] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #246] +; 
NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #240] +; NONEON-NOSVE-NEXT: sbfx w13, w13, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w14, w14, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x4 +; NONEON-NOSVE-NEXT: and w10, w10, #0x8 +; NONEON-NOSVE-NEXT: sbfx w15, w15, #0, #1 +; NONEON-NOSVE-NEXT: orr w8, w8, w10 +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #247] +; NONEON-NOSVE-NEXT: and w11, w11, #0x2 +; NONEON-NOSVE-NEXT: and w13, w13, #0x10 +; NONEON-NOSVE-NEXT: bfxil w11, w12, #0, #1 +; NONEON-NOSVE-NEXT: and w12, w14, #0x20 +; NONEON-NOSVE-NEXT: orr w8, w8, w13 +; NONEON-NOSVE-NEXT: and w13, w15, #0x40 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: orr w8, w11, w8 +; NONEON-NOSVE-NEXT: orr w11, w12, w13 +; NONEON-NOSVE-NEXT: orr w8, w8, w11 +; NONEON-NOSVE-NEXT: and w10, w10, #0x80 +; NONEON-NOSVE-NEXT: add w10, w8, w10 +; NONEON-NOSVE-NEXT: and w8, w10, #0xff +; NONEON-NOSVE-NEXT: tbz w10, #0, .LBB1_2 ; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load -; NONEON-NOSVE-NEXT: ldr b0, [x0] +; NONEON-NOSVE-NEXT: ldrb w10, [x0] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #239] +; NONEON-NOSVE-NEXT: sturh wzr, [x9, #61] +; NONEON-NOSVE-NEXT: stur wzr, [x9, #57] +; NONEON-NOSVE-NEXT: strb w10, [sp, #232] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #232] ; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB1_3 ; NONEON-NOSVE-NEXT: b .LBB1_4 ; NONEON-NOSVE-NEXT: .LBB1_2: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x10, .LCPI1_0 +; NONEON-NOSVE-NEXT: ldr d0, [x10, :lo12:.LCPI1_0] ; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB1_4 ; NONEON-NOSVE-NEXT: .LBB1_3: // %cond.load1 -; NONEON-NOSVE-NEXT: add x9, x0, #1 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[1], [x9] +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #1] +; NONEON-NOSVE-NEXT: str d0, [sp, #208] +; NONEON-NOSVE-NEXT: strb w10, [sp, #224] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #214] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #224] +; 
NONEON-NOSVE-NEXT: strh w10, [sp, #222] +; NONEON-NOSVE-NEXT: str d0, [sp, #200] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #34] +; NONEON-NOSVE-NEXT: stur w10, [x9, #42] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #208] +; NONEON-NOSVE-NEXT: strb w10, [sp, #216] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #200] +; NONEON-NOSVE-NEXT: strb w10, [sp, #217] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #216] ; NONEON-NOSVE-NEXT: .LBB1_4: // %else2 -; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB1_11 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB1_12 ; NONEON-NOSVE-NEXT: // %bb.5: // %else5 -; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB1_12 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB1_13 ; NONEON-NOSVE-NEXT: .LBB1_6: // %else8 -; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB1_13 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB1_14 ; NONEON-NOSVE-NEXT: .LBB1_7: // %else11 -; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB1_14 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB1_15 ; NONEON-NOSVE-NEXT: .LBB1_8: // %else14 -; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB1_15 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB1_16 ; NONEON-NOSVE-NEXT: .LBB1_9: // %else17 -; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB1_16 -; NONEON-NOSVE-NEXT: .LBB1_10: // %else20 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB1_11 +; NONEON-NOSVE-NEXT: .LBB1_10: // %cond.load19 +; NONEON-NOSVE-NEXT: ldrb w8, [x0, #7] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: str d0, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: .LBB1_11: // %else20 +; NONEON-NOSVE-NEXT: ldr x29, [sp, #256] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: 
add sp, sp, #272 ; NONEON-NOSVE-NEXT: ret -; NONEON-NOSVE-NEXT: .LBB1_11: // %cond.load4 -; NONEON-NOSVE-NEXT: add x9, x0, #2 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[2], [x9] +; NONEON-NOSVE-NEXT: .LBB1_12: // %cond.load4 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #2] +; NONEON-NOSVE-NEXT: str d0, [sp, #176] +; NONEON-NOSVE-NEXT: strb w10, [sp, #192] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #183] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #192] +; NONEON-NOSVE-NEXT: strb w10, [sp, #191] +; NONEON-NOSVE-NEXT: str d0, [sp, #168] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #3] +; NONEON-NOSVE-NEXT: stur w10, [x9, #11] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #176] +; NONEON-NOSVE-NEXT: strh w9, [sp, #184] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #168] +; NONEON-NOSVE-NEXT: strb w9, [sp, #186] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #184] ; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB1_6 -; NONEON-NOSVE-NEXT: .LBB1_12: // %cond.load7 -; NONEON-NOSVE-NEXT: add x9, x0, #3 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[3], [x9] +; NONEON-NOSVE-NEXT: .LBB1_13: // %cond.load7 +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #3] +; NONEON-NOSVE-NEXT: str d0, [sp, #144] +; NONEON-NOSVE-NEXT: strb w9, [sp, #160] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #148] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #160] +; NONEON-NOSVE-NEXT: str w9, [sp, #156] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #146] +; NONEON-NOSVE-NEXT: str d0, [sp, #136] +; NONEON-NOSVE-NEXT: strb w9, [sp, #154] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #144] +; NONEON-NOSVE-NEXT: strh w9, [sp, #152] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #136] +; NONEON-NOSVE-NEXT: strb w9, [sp, #155] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #152] ; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB1_7 -; NONEON-NOSVE-NEXT: .LBB1_13: // %cond.load10 -; NONEON-NOSVE-NEXT: add x9, x0, #4 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[4], [x9] +; NONEON-NOSVE-NEXT: .LBB1_14: // %cond.load10 +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #4] +; NONEON-NOSVE-NEXT: str d0, [sp, #112] +; NONEON-NOSVE-NEXT: strb w9, [sp, #128] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #119] +; 
NONEON-NOSVE-NEXT: ldr d0, [sp, #128] +; NONEON-NOSVE-NEXT: strb w9, [sp, #127] +; NONEON-NOSVE-NEXT: ldurh w9, [sp, #117] +; NONEON-NOSVE-NEXT: str d0, [sp, #104] +; NONEON-NOSVE-NEXT: sturh w9, [sp, #125] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #112] +; NONEON-NOSVE-NEXT: str w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #104] +; NONEON-NOSVE-NEXT: strb w9, [sp, #124] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #120] ; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB1_8 -; NONEON-NOSVE-NEXT: .LBB1_14: // %cond.load13 -; NONEON-NOSVE-NEXT: add x9, x0, #5 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[5], [x9] +; NONEON-NOSVE-NEXT: .LBB1_15: // %cond.load13 +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #5] +; NONEON-NOSVE-NEXT: str d0, [sp, #80] +; NONEON-NOSVE-NEXT: strb w9, [sp, #96] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #86] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #96] +; NONEON-NOSVE-NEXT: strh w9, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #84] +; NONEON-NOSVE-NEXT: str d0, [sp, #72] +; NONEON-NOSVE-NEXT: strb w9, [sp, #92] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #80] +; NONEON-NOSVE-NEXT: str w9, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #72] +; NONEON-NOSVE-NEXT: strb w9, [sp, #93] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #88] ; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB1_9 -; NONEON-NOSVE-NEXT: .LBB1_15: // %cond.load16 -; NONEON-NOSVE-NEXT: add x9, x0, #6 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[6], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB1_10 -; NONEON-NOSVE-NEXT: .LBB1_16: // %cond.load19 -; NONEON-NOSVE-NEXT: add x8, x0, #7 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[7], [x8] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 -; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB1_16: // %cond.load16 +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #6] +; NONEON-NOSVE-NEXT: str d0, [sp, #48] +; NONEON-NOSVE-NEXT: strb w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #55] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #64] +; NONEON-NOSVE-NEXT: strb w9, [sp, #63] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #52] +; NONEON-NOSVE-NEXT: str d0, [sp, #40] 
+; NONEON-NOSVE-NEXT: strh w9, [sp, #60] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #48] +; NONEON-NOSVE-NEXT: str w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w9, [sp, #62] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB1_10 +; NONEON-NOSVE-NEXT: b .LBB1_11 %load = call <8 x i8> @llvm.masked.load.v8i8(ptr %src, i32 8, <8 x i1> %mask, <8 x i8> zeroinitializer) ret <8 x i8> %load } @@ -152,112 +314,413 @@ define <16 x i8> @masked_load_v16i8(ptr %src, <16 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_load_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.16b, v0.16b, #7 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI2_0 -; NONEON-NOSVE-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] -; NONEON-NOSVE-NEXT: cmlt v0.16b, v0.16b, #0 -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: sub sp, sp, #1024 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 1040 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: str q0, [sp, #976] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #984] +; NONEON-NOSVE-NEXT: and w8, w8, #0x1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1000] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #976] +; NONEON-NOSVE-NEXT: and w8, w8, #0x1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #992] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #991] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x80 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1007] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #990] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x40 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1006] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #989] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x20 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1005] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #988] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x10 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1004] +; 
NONEON-NOSVE-NEXT: ldrb w8, [sp, #987] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1003] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #986] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x4 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1002] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #985] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1001] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #983] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x80 +; NONEON-NOSVE-NEXT: strb w8, [sp, #999] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #982] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x40 +; NONEON-NOSVE-NEXT: strb w8, [sp, #998] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #981] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x20 +; NONEON-NOSVE-NEXT: strb w8, [sp, #997] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #980] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x10 +; NONEON-NOSVE-NEXT: strb w8, [sp, #996] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #979] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #995] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #978] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x4 +; NONEON-NOSVE-NEXT: strb w8, [sp, #994] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #977] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: strb w8, [sp, #993] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #992] ; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: addv h1, v0.8h -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 -; NONEON-NOSVE-NEXT: fmov w8, s1 -; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB2_17 -; NONEON-NOSVE-NEXT: // %bb.1: // 
%else -; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB2_18 -; NONEON-NOSVE-NEXT: .LBB2_2: // %else2 -; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB2_19 -; NONEON-NOSVE-NEXT: .LBB2_3: // %else5 -; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB2_20 -; NONEON-NOSVE-NEXT: .LBB2_4: // %else8 -; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB2_21 -; NONEON-NOSVE-NEXT: .LBB2_5: // %else11 -; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB2_22 -; NONEON-NOSVE-NEXT: .LBB2_6: // %else14 -; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB2_23 -; NONEON-NOSVE-NEXT: .LBB2_7: // %else17 -; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB2_24 -; NONEON-NOSVE-NEXT: .LBB2_8: // %else20 -; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB2_25 -; NONEON-NOSVE-NEXT: .LBB2_9: // %else23 -; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB2_26 -; NONEON-NOSVE-NEXT: .LBB2_10: // %else26 -; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB2_27 -; NONEON-NOSVE-NEXT: .LBB2_11: // %else29 -; NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB2_28 -; NONEON-NOSVE-NEXT: .LBB2_12: // %else32 -; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB2_29 -; NONEON-NOSVE-NEXT: .LBB2_13: // %else35 -; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB2_30 -; NONEON-NOSVE-NEXT: .LBB2_14: // %else38 -; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB2_31 -; NONEON-NOSVE-NEXT: .LBB2_15: // %else41 -; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB2_32 -; NONEON-NOSVE-NEXT: .LBB2_16: // %else44 -; NONEON-NOSVE-NEXT: ret -; NONEON-NOSVE-NEXT: .LBB2_17: // %cond.load -; NONEON-NOSVE-NEXT: ldr b0, [x0] -; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB2_2 -; NONEON-NOSVE-NEXT: .LBB2_18: // %cond.load1 -; NONEON-NOSVE-NEXT: add x9, x0, #1 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[1], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB2_3 -; NONEON-NOSVE-NEXT: .LBB2_19: // %cond.load4 -; NONEON-NOSVE-NEXT: add x9, x0, #2 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[2], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB2_4 -; NONEON-NOSVE-NEXT: .LBB2_20: // %cond.load7 -; NONEON-NOSVE-NEXT: add x9, x0, #3 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[3], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB2_5 -; NONEON-NOSVE-NEXT: .LBB2_21: // 
%cond.load10 -; NONEON-NOSVE-NEXT: add x9, x0, #4 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[4], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB2_6 -; NONEON-NOSVE-NEXT: .LBB2_22: // %cond.load13 -; NONEON-NOSVE-NEXT: add x9, x0, #5 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[5], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB2_7 -; NONEON-NOSVE-NEXT: .LBB2_23: // %cond.load16 -; NONEON-NOSVE-NEXT: add x9, x0, #6 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[6], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB2_8 -; NONEON-NOSVE-NEXT: .LBB2_24: // %cond.load19 -; NONEON-NOSVE-NEXT: add x9, x0, #7 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[7], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #8, .LBB2_9 -; NONEON-NOSVE-NEXT: .LBB2_25: // %cond.load22 -; NONEON-NOSVE-NEXT: add x9, x0, #8 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[8], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #9, .LBB2_10 -; NONEON-NOSVE-NEXT: .LBB2_26: // %cond.load25 -; NONEON-NOSVE-NEXT: add x9, x0, #9 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[9], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #10, .LBB2_11 -; NONEON-NOSVE-NEXT: .LBB2_27: // %cond.load28 -; NONEON-NOSVE-NEXT: add x9, x0, #10 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[10], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #11, .LBB2_12 -; NONEON-NOSVE-NEXT: .LBB2_28: // %cond.load31 -; NONEON-NOSVE-NEXT: add x9, x0, #11 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[11], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #12, .LBB2_13 -; NONEON-NOSVE-NEXT: .LBB2_29: // %cond.load34 -; NONEON-NOSVE-NEXT: add x9, x0, #12 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[12], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #13, .LBB2_14 -; NONEON-NOSVE-NEXT: .LBB2_30: // %cond.load37 -; NONEON-NOSVE-NEXT: add x9, x0, #13 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[13], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #14, .LBB2_15 -; NONEON-NOSVE-NEXT: .LBB2_31: // %cond.load40 -; NONEON-NOSVE-NEXT: add x9, x0, #14 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[14], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB2_16 -; NONEON-NOSVE-NEXT: .LBB2_32: // %cond.load43 -; NONEON-NOSVE-NEXT: add x8, x0, #15 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[15], [x8] 
+; NONEON-NOSVE-NEXT: str q0, [sp, #1008] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #1010] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #1008] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #1012] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #1014] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #1016] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #1018] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #1020] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: add w9, w10, w11 +; NONEON-NOSVE-NEXT: add w10, w12, w13 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: add w9, w10, w14 +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #1022] +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: add x9, sp, #720 +; NONEON-NOSVE-NEXT: add w8, w8, w10 +; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB2_2 +; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load +; NONEON-NOSVE-NEXT: ldrb w10, [x0] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #975] +; NONEON-NOSVE-NEXT: sturh wzr, [x9, #253] +; NONEON-NOSVE-NEXT: stur wzr, [x9, #249] +; NONEON-NOSVE-NEXT: stur xzr, [x9, #241] +; NONEON-NOSVE-NEXT: strb w10, [sp, #960] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #960] +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB2_3 +; NONEON-NOSVE-NEXT: b .LBB2_4 +; NONEON-NOSVE-NEXT: .LBB2_2: +; NONEON-NOSVE-NEXT: adrp x10, .LCPI2_0 +; NONEON-NOSVE-NEXT: ldr q0, [x10, :lo12:.LCPI2_0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB2_4 +; NONEON-NOSVE-NEXT: .LBB2_3: // %cond.load1 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #1] +; NONEON-NOSVE-NEXT: str q0, [sp, #912] +; NONEON-NOSVE-NEXT: strb w10, [sp, #944] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #926] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #944] +; NONEON-NOSVE-NEXT: strh w10, [sp, #942] +; NONEON-NOSVE-NEXT: str q0, [sp, #896] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #202] +; NONEON-NOSVE-NEXT: ldur x11, [x9, #194] +; NONEON-NOSVE-NEXT: stur w10, [x9, #218] +; NONEON-NOSVE-NEXT: stur x11, [x9, #210] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #912] +; NONEON-NOSVE-NEXT: strb w10, [sp, #928] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #896] +; NONEON-NOSVE-NEXT: strb w10, [sp, 
#929] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #928] +; NONEON-NOSVE-NEXT: .LBB2_4: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB2_20 +; NONEON-NOSVE-NEXT: // %bb.5: // %else5 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB2_21 +; NONEON-NOSVE-NEXT: .LBB2_6: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB2_22 +; NONEON-NOSVE-NEXT: .LBB2_7: // %else11 +; NONEON-NOSVE-NEXT: add x9, sp, #464 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB2_23 +; NONEON-NOSVE-NEXT: .LBB2_8: // %else14 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB2_24 +; NONEON-NOSVE-NEXT: .LBB2_9: // %else17 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB2_25 +; NONEON-NOSVE-NEXT: .LBB2_10: // %else20 +; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB2_26 +; NONEON-NOSVE-NEXT: .LBB2_11: // %else23 +; NONEON-NOSVE-NEXT: add x9, sp, #208 +; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB2_27 +; NONEON-NOSVE-NEXT: .LBB2_12: // %else26 +; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB2_28 +; NONEON-NOSVE-NEXT: .LBB2_13: // %else29 +; NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB2_29 +; NONEON-NOSVE-NEXT: .LBB2_14: // %else32 +; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB2_30 +; NONEON-NOSVE-NEXT: .LBB2_15: // %else35 +; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB2_31 +; NONEON-NOSVE-NEXT: .LBB2_16: // %else38 +; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB2_32 +; NONEON-NOSVE-NEXT: .LBB2_17: // %else41 +; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB2_19 +; NONEON-NOSVE-NEXT: .LBB2_18: // %cond.load43 +; NONEON-NOSVE-NEXT: ldrb w8, [x0, #15] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; 
NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: .LBB2_19: // %else44 +; NONEON-NOSVE-NEXT: add sp, sp, #1024 +; NONEON-NOSVE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB2_20: // %cond.load4 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #2] +; NONEON-NOSVE-NEXT: str q0, [sp, #848] +; NONEON-NOSVE-NEXT: strb w10, [sp, #880] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #863] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #880] +; NONEON-NOSVE-NEXT: strb w10, [sp, #879] +; NONEON-NOSVE-NEXT: str q0, [sp, #832] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #139] +; NONEON-NOSVE-NEXT: ldur x11, [x9, #131] +; NONEON-NOSVE-NEXT: stur w10, [x9, #155] +; NONEON-NOSVE-NEXT: stur x11, [x9, #147] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #848] +; NONEON-NOSVE-NEXT: strh w10, [sp, #864] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #832] +; NONEON-NOSVE-NEXT: strb w10, [sp, #866] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #864] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB2_6 +; NONEON-NOSVE-NEXT: .LBB2_21: // %cond.load7 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #3] +; NONEON-NOSVE-NEXT: str q0, [sp, #784] +; NONEON-NOSVE-NEXT: strb w10, [sp, #816] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #796] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #816] +; NONEON-NOSVE-NEXT: str w10, [sp, #812] +; NONEON-NOSVE-NEXT: str q0, [sp, #768] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #68] +; NONEON-NOSVE-NEXT: stur x10, [x9, #84] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #786] +; NONEON-NOSVE-NEXT: strb w10, [sp, #802] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #784] +; NONEON-NOSVE-NEXT: strh w10, [sp, #800] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #768] +; NONEON-NOSVE-NEXT: strb w10, [sp, #803] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #800] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB2_7 +; NONEON-NOSVE-NEXT: .LBB2_22: // %cond.load10 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #4] +; NONEON-NOSVE-NEXT: str q0, [sp, #720] +; NONEON-NOSVE-NEXT: strb w10, [sp, #752] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #735] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #752] +; 
NONEON-NOSVE-NEXT: strb w10, [sp, #751] +; NONEON-NOSVE-NEXT: str q0, [sp, #704] +; NONEON-NOSVE-NEXT: ldurh w10, [x9, #13] +; NONEON-NOSVE-NEXT: ldur x11, [x9, #5] +; NONEON-NOSVE-NEXT: sturh w10, [x9, #29] +; NONEON-NOSVE-NEXT: stur x11, [x9, #21] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #720] +; NONEON-NOSVE-NEXT: str w9, [sp, #736] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #704] +; NONEON-NOSVE-NEXT: strb w9, [sp, #740] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #736] +; NONEON-NOSVE-NEXT: add x9, sp, #464 +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB2_8 +; NONEON-NOSVE-NEXT: .LBB2_23: // %cond.load13 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #5] +; NONEON-NOSVE-NEXT: str q0, [sp, #656] +; NONEON-NOSVE-NEXT: strb w10, [sp, #688] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #670] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #688] +; NONEON-NOSVE-NEXT: strh w10, [sp, #686] +; NONEON-NOSVE-NEXT: str q0, [sp, #640] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #198] +; NONEON-NOSVE-NEXT: stur x10, [x9, #214] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #660] +; NONEON-NOSVE-NEXT: strb w10, [sp, #676] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #656] +; NONEON-NOSVE-NEXT: str w10, [sp, #672] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #640] +; NONEON-NOSVE-NEXT: strb w10, [sp, #677] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #672] +; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB2_9 +; NONEON-NOSVE-NEXT: .LBB2_24: // %cond.load16 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #6] +; NONEON-NOSVE-NEXT: str q0, [sp, #592] +; NONEON-NOSVE-NEXT: strb w10, [sp, #624] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #607] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #624] +; NONEON-NOSVE-NEXT: strb w10, [sp, #623] +; NONEON-NOSVE-NEXT: str q0, [sp, #576] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #135] +; NONEON-NOSVE-NEXT: stur x10, [x9, #151] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #596] +; NONEON-NOSVE-NEXT: strh w10, [sp, #612] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #592] +; NONEON-NOSVE-NEXT: str w10, [sp, #608] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #576] +; NONEON-NOSVE-NEXT: strb w10, [sp, #614] +; 
NONEON-NOSVE-NEXT: ldr q0, [sp, #608] +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB2_10 +; NONEON-NOSVE-NEXT: .LBB2_25: // %cond.load19 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #7] +; NONEON-NOSVE-NEXT: str q0, [sp, #528] +; NONEON-NOSVE-NEXT: strb w10, [sp, #560] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #536] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #560] +; NONEON-NOSVE-NEXT: str x10, [sp, #552] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #534] +; NONEON-NOSVE-NEXT: str q0, [sp, #512] +; NONEON-NOSVE-NEXT: strb w10, [sp, #550] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #532] +; NONEON-NOSVE-NEXT: strh w10, [sp, #548] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #528] +; NONEON-NOSVE-NEXT: str w10, [sp, #544] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #512] +; NONEON-NOSVE-NEXT: strb w10, [sp, #551] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #544] +; NONEON-NOSVE-NEXT: tbz w8, #8, .LBB2_11 +; NONEON-NOSVE-NEXT: .LBB2_26: // %cond.load22 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #8] +; NONEON-NOSVE-NEXT: str q0, [sp, #464] +; NONEON-NOSVE-NEXT: strb w10, [sp, #496] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #479] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #496] +; NONEON-NOSVE-NEXT: strb w10, [sp, #495] +; NONEON-NOSVE-NEXT: str q0, [sp, #448] +; NONEON-NOSVE-NEXT: ldurh w10, [x9, #13] +; NONEON-NOSVE-NEXT: ldur w11, [x9, #9] +; NONEON-NOSVE-NEXT: sturh w10, [x9, #29] +; NONEON-NOSVE-NEXT: stur w11, [x9, #25] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #464] +; NONEON-NOSVE-NEXT: str x9, [sp, #480] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #448] +; NONEON-NOSVE-NEXT: strb w9, [sp, #488] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #480] +; NONEON-NOSVE-NEXT: add x9, sp, #208 +; NONEON-NOSVE-NEXT: tbz w8, #9, .LBB2_12 +; NONEON-NOSVE-NEXT: .LBB2_27: // %cond.load25 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #9] +; NONEON-NOSVE-NEXT: str q0, [sp, #400] +; NONEON-NOSVE-NEXT: strb w10, [sp, #432] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #414] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #432] +; NONEON-NOSVE-NEXT: strh w10, [sp, #430] +; NONEON-NOSVE-NEXT: str q0, [sp, #384] +; 
NONEON-NOSVE-NEXT: ldur w10, [x9, #202] +; NONEON-NOSVE-NEXT: stur w10, [x9, #218] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #408] +; NONEON-NOSVE-NEXT: strb w10, [sp, #424] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #400] +; NONEON-NOSVE-NEXT: str x10, [sp, #416] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #384] +; NONEON-NOSVE-NEXT: strb w10, [sp, #425] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #416] +; NONEON-NOSVE-NEXT: tbz w8, #10, .LBB2_13 +; NONEON-NOSVE-NEXT: .LBB2_28: // %cond.load28 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #10] +; NONEON-NOSVE-NEXT: str q0, [sp, #336] +; NONEON-NOSVE-NEXT: strb w10, [sp, #368] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #351] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #368] +; NONEON-NOSVE-NEXT: strb w10, [sp, #367] +; NONEON-NOSVE-NEXT: str q0, [sp, #320] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #139] +; NONEON-NOSVE-NEXT: stur w10, [x9, #155] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #344] +; NONEON-NOSVE-NEXT: strh w10, [sp, #360] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #336] +; NONEON-NOSVE-NEXT: str x10, [sp, #352] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #320] +; NONEON-NOSVE-NEXT: strb w10, [sp, #362] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #352] +; NONEON-NOSVE-NEXT: tbz w8, #11, .LBB2_14 +; NONEON-NOSVE-NEXT: .LBB2_29: // %cond.load31 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #11] +; NONEON-NOSVE-NEXT: str q0, [sp, #272] +; NONEON-NOSVE-NEXT: strb w10, [sp, #304] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #284] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #304] +; NONEON-NOSVE-NEXT: str w10, [sp, #300] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #282] +; NONEON-NOSVE-NEXT: str q0, [sp, #256] +; NONEON-NOSVE-NEXT: strb w10, [sp, #298] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #280] +; NONEON-NOSVE-NEXT: strh w10, [sp, #296] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #272] +; NONEON-NOSVE-NEXT: str x10, [sp, #288] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #256] +; NONEON-NOSVE-NEXT: strb w10, [sp, #299] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #288] +; NONEON-NOSVE-NEXT: tbz w8, #12, .LBB2_15 +; NONEON-NOSVE-NEXT: .LBB2_30: // 
%cond.load34 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #12] +; NONEON-NOSVE-NEXT: str q0, [sp, #208] +; NONEON-NOSVE-NEXT: strb w10, [sp, #240] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #223] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #240] +; NONEON-NOSVE-NEXT: strb w10, [sp, #239] +; NONEON-NOSVE-NEXT: str q0, [sp, #192] +; NONEON-NOSVE-NEXT: ldurh w10, [x9, #13] +; NONEON-NOSVE-NEXT: sturh w10, [x9, #29] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #216] +; NONEON-NOSVE-NEXT: str w9, [sp, #232] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #208] +; NONEON-NOSVE-NEXT: str x9, [sp, #224] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #192] +; NONEON-NOSVE-NEXT: strb w9, [sp, #236] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #224] +; NONEON-NOSVE-NEXT: tbz w8, #13, .LBB2_16 +; NONEON-NOSVE-NEXT: .LBB2_31: // %cond.load37 +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #13] +; NONEON-NOSVE-NEXT: str q0, [sp, #144] +; NONEON-NOSVE-NEXT: strb w9, [sp, #176] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #158] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #176] +; NONEON-NOSVE-NEXT: strh w9, [sp, #174] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #156] +; NONEON-NOSVE-NEXT: str q0, [sp, #128] +; NONEON-NOSVE-NEXT: strb w9, [sp, #172] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #152] +; NONEON-NOSVE-NEXT: str w9, [sp, #168] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #144] +; NONEON-NOSVE-NEXT: str x9, [sp, #160] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #128] +; NONEON-NOSVE-NEXT: strb w9, [sp, #173] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #160] +; NONEON-NOSVE-NEXT: tbz w8, #14, .LBB2_17 +; NONEON-NOSVE-NEXT: .LBB2_32: // %cond.load40 +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #14] +; NONEON-NOSVE-NEXT: str q0, [sp, #80] +; NONEON-NOSVE-NEXT: strb w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #95] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #112] +; NONEON-NOSVE-NEXT: strb w9, [sp, #111] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #92] +; NONEON-NOSVE-NEXT: str q0, [sp, #64] +; NONEON-NOSVE-NEXT: strh w9, [sp, #108] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #88] +; NONEON-NOSVE-NEXT: str w9, [sp, #104] +; 
NONEON-NOSVE-NEXT: ldr x9, [sp, #80] +; NONEON-NOSVE-NEXT: str x9, [sp, #96] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #64] +; NONEON-NOSVE-NEXT: strb w9, [sp, #110] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #96] +; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB2_18 +; NONEON-NOSVE-NEXT: b .LBB2_19 %load = call <16 x i8> @llvm.masked.load.v16i8(ptr %src, i32 8, <16 x i1> %mask, <16 x i8> zeroinitializer) ret <16 x i8> %load } @@ -342,274 +805,815 @@ define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_load_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr w8, [sp, #72] -; NONEON-NOSVE-NEXT: fmov s1, w1 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #80] -; NONEON-NOSVE-NEXT: fmov s0, w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #88] -; NONEON-NOSVE-NEXT: mov v1.b[1], w2 -; NONEON-NOSVE-NEXT: mov v0.b[1], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp] -; NONEON-NOSVE-NEXT: mov v1.b[2], w3 -; NONEON-NOSVE-NEXT: mov v0.b[2], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #96] -; NONEON-NOSVE-NEXT: mov v1.b[3], w4 -; NONEON-NOSVE-NEXT: mov v0.b[3], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #104] -; NONEON-NOSVE-NEXT: mov v1.b[4], w5 -; NONEON-NOSVE-NEXT: mov v0.b[4], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #112] -; NONEON-NOSVE-NEXT: mov v1.b[5], w6 -; NONEON-NOSVE-NEXT: mov v0.b[5], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #120] -; NONEON-NOSVE-NEXT: mov v1.b[6], w7 -; NONEON-NOSVE-NEXT: mov v0.b[6], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #128] -; NONEON-NOSVE-NEXT: mov v1.b[7], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] -; NONEON-NOSVE-NEXT: mov v0.b[7], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #136] -; NONEON-NOSVE-NEXT: mov v1.b[8], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] -; NONEON-NOSVE-NEXT: mov v0.b[8], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #144] -; NONEON-NOSVE-NEXT: mov v1.b[9], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #24] -; NONEON-NOSVE-NEXT: mov v0.b[9], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #152] -; NONEON-NOSVE-NEXT: mov v1.b[10], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #32] 
-; NONEON-NOSVE-NEXT: mov v0.b[10], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #160] -; NONEON-NOSVE-NEXT: mov v1.b[11], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #40] -; NONEON-NOSVE-NEXT: mov v0.b[11], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #168] -; NONEON-NOSVE-NEXT: mov v1.b[12], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #48] -; NONEON-NOSVE-NEXT: mov v0.b[12], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #176] -; NONEON-NOSVE-NEXT: mov v1.b[13], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #56] -; NONEON-NOSVE-NEXT: mov v0.b[13], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #184] -; NONEON-NOSVE-NEXT: mov v1.b[14], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #64] -; NONEON-NOSVE-NEXT: mov v0.b[14], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #192] -; NONEON-NOSVE-NEXT: mov v1.b[15], w9 -; NONEON-NOSVE-NEXT: mov v0.b[15], w8 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI3_0 -; NONEON-NOSVE-NEXT: ldr q2, [x8, :lo12:.LCPI3_0] -; NONEON-NOSVE-NEXT: shl v1.16b, v1.16b, #7 -; NONEON-NOSVE-NEXT: shl v0.16b, v0.16b, #7 -; NONEON-NOSVE-NEXT: cmlt v1.16b, v1.16b, #0 -; NONEON-NOSVE-NEXT: cmlt v0.16b, v0.16b, #0 -; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v2.16b -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; NONEON-NOSVE-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; NONEON-NOSVE-NEXT: zip1 v1.16b, v1.16b, v3.16b -; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: addv h1, v1.8h -; NONEON-NOSVE-NEXT: addv h0, v0.8h -; NONEON-NOSVE-NEXT: fmov w8, s1 -; NONEON-NOSVE-NEXT: movi v1.2d, #0000000000000000 -; NONEON-NOSVE-NEXT: fmov w9, s0 -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 -; NONEON-NOSVE-NEXT: bfi w8, w9, #16, #16 -; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB3_33 -; NONEON-NOSVE-NEXT: // %bb.1: // %else -; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB3_34 -; NONEON-NOSVE-NEXT: .LBB3_2: // %else2 -; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB3_35 -; NONEON-NOSVE-NEXT: .LBB3_3: // %else5 -; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB3_36 -; NONEON-NOSVE-NEXT: .LBB3_4: // %else8 
-; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB3_37 -; NONEON-NOSVE-NEXT: .LBB3_5: // %else11 -; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB3_38 -; NONEON-NOSVE-NEXT: .LBB3_6: // %else14 -; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB3_39 -; NONEON-NOSVE-NEXT: .LBB3_7: // %else17 -; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB3_40 -; NONEON-NOSVE-NEXT: .LBB3_8: // %else20 -; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB3_41 -; NONEON-NOSVE-NEXT: .LBB3_9: // %else23 -; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB3_42 -; NONEON-NOSVE-NEXT: .LBB3_10: // %else26 -; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB3_43 -; NONEON-NOSVE-NEXT: .LBB3_11: // %else29 -; NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB3_44 -; NONEON-NOSVE-NEXT: .LBB3_12: // %else32 -; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB3_45 -; NONEON-NOSVE-NEXT: .LBB3_13: // %else35 -; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB3_46 -; NONEON-NOSVE-NEXT: .LBB3_14: // %else38 -; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB3_47 -; NONEON-NOSVE-NEXT: .LBB3_15: // %else41 -; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB3_48 -; NONEON-NOSVE-NEXT: .LBB3_16: // %else44 -; NONEON-NOSVE-NEXT: tbnz w8, #16, .LBB3_49 -; NONEON-NOSVE-NEXT: .LBB3_17: // %else47 -; NONEON-NOSVE-NEXT: tbnz w8, #17, .LBB3_50 -; NONEON-NOSVE-NEXT: .LBB3_18: // %else50 -; NONEON-NOSVE-NEXT: tbnz w8, #18, .LBB3_51 -; NONEON-NOSVE-NEXT: .LBB3_19: // %else53 -; NONEON-NOSVE-NEXT: tbnz w8, #19, .LBB3_52 -; NONEON-NOSVE-NEXT: .LBB3_20: // %else56 -; NONEON-NOSVE-NEXT: tbnz w8, #20, .LBB3_53 -; NONEON-NOSVE-NEXT: .LBB3_21: // %else59 -; NONEON-NOSVE-NEXT: tbnz w8, #21, .LBB3_54 -; NONEON-NOSVE-NEXT: .LBB3_22: // %else62 -; NONEON-NOSVE-NEXT: tbnz w8, #22, .LBB3_55 -; NONEON-NOSVE-NEXT: .LBB3_23: // %else65 -; NONEON-NOSVE-NEXT: tbnz w8, #23, .LBB3_56 -; NONEON-NOSVE-NEXT: .LBB3_24: // %else68 -; NONEON-NOSVE-NEXT: tbnz w8, #24, .LBB3_57 -; NONEON-NOSVE-NEXT: .LBB3_25: // %else71 -; NONEON-NOSVE-NEXT: tbnz w8, #25, .LBB3_58 -; NONEON-NOSVE-NEXT: .LBB3_26: // %else74 -; NONEON-NOSVE-NEXT: tbnz w8, #26, .LBB3_59 -; NONEON-NOSVE-NEXT: 
.LBB3_27: // %else77 -; NONEON-NOSVE-NEXT: tbnz w8, #27, .LBB3_60 -; NONEON-NOSVE-NEXT: .LBB3_28: // %else80 -; NONEON-NOSVE-NEXT: tbnz w8, #28, .LBB3_61 -; NONEON-NOSVE-NEXT: .LBB3_29: // %else83 -; NONEON-NOSVE-NEXT: tbnz w8, #29, .LBB3_62 -; NONEON-NOSVE-NEXT: .LBB3_30: // %else86 -; NONEON-NOSVE-NEXT: tbnz w8, #30, .LBB3_63 -; NONEON-NOSVE-NEXT: .LBB3_31: // %else89 -; NONEON-NOSVE-NEXT: tbnz w8, #31, .LBB3_64 -; NONEON-NOSVE-NEXT: .LBB3_32: // %else92 -; NONEON-NOSVE-NEXT: ret -; NONEON-NOSVE-NEXT: .LBB3_33: // %cond.load -; NONEON-NOSVE-NEXT: ldr b0, [x0] -; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB3_2 -; NONEON-NOSVE-NEXT: .LBB3_34: // %cond.load1 -; NONEON-NOSVE-NEXT: add x9, x0, #1 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[1], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB3_3 -; NONEON-NOSVE-NEXT: .LBB3_35: // %cond.load4 -; NONEON-NOSVE-NEXT: add x9, x0, #2 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[2], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB3_4 -; NONEON-NOSVE-NEXT: .LBB3_36: // %cond.load7 -; NONEON-NOSVE-NEXT: add x9, x0, #3 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[3], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB3_5 -; NONEON-NOSVE-NEXT: .LBB3_37: // %cond.load10 -; NONEON-NOSVE-NEXT: add x9, x0, #4 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[4], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB3_6 -; NONEON-NOSVE-NEXT: .LBB3_38: // %cond.load13 -; NONEON-NOSVE-NEXT: add x9, x0, #5 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[5], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB3_7 -; NONEON-NOSVE-NEXT: .LBB3_39: // %cond.load16 -; NONEON-NOSVE-NEXT: add x9, x0, #6 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[6], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB3_8 -; NONEON-NOSVE-NEXT: .LBB3_40: // %cond.load19 -; NONEON-NOSVE-NEXT: add x9, x0, #7 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[7], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #8, .LBB3_9 -; NONEON-NOSVE-NEXT: .LBB3_41: // %cond.load22 -; NONEON-NOSVE-NEXT: add x9, x0, #8 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[8], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #9, .LBB3_10 -; NONEON-NOSVE-NEXT: 
.LBB3_42: // %cond.load25 -; NONEON-NOSVE-NEXT: add x9, x0, #9 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[9], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #10, .LBB3_11 -; NONEON-NOSVE-NEXT: .LBB3_43: // %cond.load28 -; NONEON-NOSVE-NEXT: add x9, x0, #10 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[10], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #11, .LBB3_12 -; NONEON-NOSVE-NEXT: .LBB3_44: // %cond.load31 -; NONEON-NOSVE-NEXT: add x9, x0, #11 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[11], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #12, .LBB3_13 -; NONEON-NOSVE-NEXT: .LBB3_45: // %cond.load34 -; NONEON-NOSVE-NEXT: add x9, x0, #12 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[12], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #13, .LBB3_14 -; NONEON-NOSVE-NEXT: .LBB3_46: // %cond.load37 -; NONEON-NOSVE-NEXT: add x9, x0, #13 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[13], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #14, .LBB3_15 -; NONEON-NOSVE-NEXT: .LBB3_47: // %cond.load40 -; NONEON-NOSVE-NEXT: add x9, x0, #14 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[14], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB3_16 -; NONEON-NOSVE-NEXT: .LBB3_48: // %cond.load43 -; NONEON-NOSVE-NEXT: add x9, x0, #15 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[15], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #16, .LBB3_17 -; NONEON-NOSVE-NEXT: .LBB3_49: // %cond.load46 -; NONEON-NOSVE-NEXT: add x9, x0, #16 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[0], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #17, .LBB3_18 -; NONEON-NOSVE-NEXT: .LBB3_50: // %cond.load49 -; NONEON-NOSVE-NEXT: add x9, x0, #17 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[1], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #18, .LBB3_19 -; NONEON-NOSVE-NEXT: .LBB3_51: // %cond.load52 -; NONEON-NOSVE-NEXT: add x9, x0, #18 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[2], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #19, .LBB3_20 -; NONEON-NOSVE-NEXT: .LBB3_52: // %cond.load55 -; NONEON-NOSVE-NEXT: add x9, x0, #19 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[3], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #20, .LBB3_21 -; NONEON-NOSVE-NEXT: .LBB3_53: // %cond.load58 -; NONEON-NOSVE-NEXT: add x9, x0, #20 -; 
NONEON-NOSVE-NEXT: ld1 { v1.b }[4], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #21, .LBB3_22 -; NONEON-NOSVE-NEXT: .LBB3_54: // %cond.load61 -; NONEON-NOSVE-NEXT: add x9, x0, #21 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[5], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #22, .LBB3_23 -; NONEON-NOSVE-NEXT: .LBB3_55: // %cond.load64 -; NONEON-NOSVE-NEXT: add x9, x0, #22 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[6], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #23, .LBB3_24 -; NONEON-NOSVE-NEXT: .LBB3_56: // %cond.load67 -; NONEON-NOSVE-NEXT: add x9, x0, #23 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[7], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #24, .LBB3_25 -; NONEON-NOSVE-NEXT: .LBB3_57: // %cond.load70 -; NONEON-NOSVE-NEXT: add x9, x0, #24 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[8], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #25, .LBB3_26 -; NONEON-NOSVE-NEXT: .LBB3_58: // %cond.load73 -; NONEON-NOSVE-NEXT: add x9, x0, #25 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[9], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #26, .LBB3_27 -; NONEON-NOSVE-NEXT: .LBB3_59: // %cond.load76 -; NONEON-NOSVE-NEXT: add x9, x0, #26 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[10], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #27, .LBB3_28 -; NONEON-NOSVE-NEXT: .LBB3_60: // %cond.load79 -; NONEON-NOSVE-NEXT: add x9, x0, #27 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[11], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #28, .LBB3_29 -; NONEON-NOSVE-NEXT: .LBB3_61: // %cond.load82 -; NONEON-NOSVE-NEXT: add x9, x0, #28 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[12], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #29, .LBB3_30 -; NONEON-NOSVE-NEXT: .LBB3_62: // %cond.load85 -; NONEON-NOSVE-NEXT: add x9, x0, #29 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[13], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #30, .LBB3_31 -; NONEON-NOSVE-NEXT: .LBB3_63: // %cond.load88 -; NONEON-NOSVE-NEXT: add x9, x0, #30 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[14], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #31, .LBB3_32 -; NONEON-NOSVE-NEXT: .LBB3_64: // %cond.load91 -; NONEON-NOSVE-NEXT: add x8, x0, #31 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[15], [x8] +; NONEON-NOSVE-NEXT: str x29, [sp, 
#-16]! // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: sub sp, sp, #2064 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 2080 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #2216] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #2152] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #2272] +; NONEON-NOSVE-NEXT: ldr w11, [sp, #2176] +; NONEON-NOSVE-NEXT: ldr w12, [sp, #2160] +; NONEON-NOSVE-NEXT: and w8, w8, #0x1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #2024] +; NONEON-NOSVE-NEXT: and w8, w9, #0x1 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #2264] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2016] +; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w9, #0x80 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #2256] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2031] +; NONEON-NOSVE-NEXT: and w8, w9, #0x40 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #2248] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2030] +; NONEON-NOSVE-NEXT: and w8, w9, #0x20 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #2240] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2029] +; NONEON-NOSVE-NEXT: and w8, w9, #0x10 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #2232] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2028] +; NONEON-NOSVE-NEXT: and w8, w9, #0x8 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #2224] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2027] +; NONEON-NOSVE-NEXT: and w8, w9, #0x4 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #2208] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2026] +; NONEON-NOSVE-NEXT: and w8, w9, #0x2 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #2200] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2025] +; NONEON-NOSVE-NEXT: and w8, w9, #0x80 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #2192] +; NONEON-NOSVE-NEXT: strb 
w8, [sp, #2023] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #2184] +; NONEON-NOSVE-NEXT: and w9, w9, #0x40 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: strb w9, [sp, #2022] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #2168] +; NONEON-NOSVE-NEXT: and w10, w10, #0x20 +; NONEON-NOSVE-NEXT: and w8, w8, #0x10 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: strb w10, [sp, #2021] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2020] +; NONEON-NOSVE-NEXT: and w8, w11, #0x8 +; NONEON-NOSVE-NEXT: sbfx w10, w12, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #2019] +; NONEON-NOSVE-NEXT: and w8, w9, #0x4 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #2088] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2018] +; NONEON-NOSVE-NEXT: and w8, w10, #0x2 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #2136] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2017] +; NONEON-NOSVE-NEXT: and w8, w9, #0x1 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #2144] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2008] +; NONEON-NOSVE-NEXT: and w8, w1, #0x1 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #2104] +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #2000] +; NONEON-NOSVE-NEXT: ldr w12, [sp, #2080] +; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1 +; NONEON-NOSVE-NEXT: ldr q0, [sp, #2016] +; NONEON-NOSVE-NEXT: and w8, w9, #0x80 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #2128] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2015] +; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; NONEON-NOSVE-NEXT: and w8, w9, #0x40 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #2120] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2014] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #2112] +; NONEON-NOSVE-NEXT: and w9, w9, #0x20 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: strb w9, [sp, #2013] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #2096] +; NONEON-NOSVE-NEXT: and w10, w10, #0x10 +; NONEON-NOSVE-NEXT: zip1 v0.16b, 
v0.16b, v1.16b +; NONEON-NOSVE-NEXT: and w8, w8, #0x8 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: strb w10, [sp, #2012] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2011] +; NONEON-NOSVE-NEXT: and w8, w11, #0x4 +; NONEON-NOSVE-NEXT: sbfx w10, w12, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #2010] +; NONEON-NOSVE-NEXT: and w8, w9, #0x2 +; NONEON-NOSVE-NEXT: sbfx w9, w7, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #2009] +; NONEON-NOSVE-NEXT: and w8, w10, #0x80 +; NONEON-NOSVE-NEXT: sbfx w10, w6, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #2007] +; NONEON-NOSVE-NEXT: and w8, w9, #0x40 +; NONEON-NOSVE-NEXT: sbfx w9, w5, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #2006] +; NONEON-NOSVE-NEXT: and w8, w10, #0x20 +; NONEON-NOSVE-NEXT: sbfx w10, w4, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #2005] +; NONEON-NOSVE-NEXT: and w8, w9, #0x10 +; NONEON-NOSVE-NEXT: sbfx w9, w3, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #2004] +; NONEON-NOSVE-NEXT: and w8, w10, #0x8 +; NONEON-NOSVE-NEXT: sbfx w10, w2, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #2003] +; NONEON-NOSVE-NEXT: and w8, w9, #0x4 +; NONEON-NOSVE-NEXT: strb w8, [sp, #2002] +; NONEON-NOSVE-NEXT: and w8, w10, #0x2 +; NONEON-NOSVE-NEXT: strb w8, [sp, #2001] +; NONEON-NOSVE-NEXT: str q0, [sp, #2048] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #2000] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2050] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2048] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #2052] +; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #2054] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #2056] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #2058] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: add w9, w10, w11 +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #2060] +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: add w9, w12, w13 +; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: add w9, w9, w10 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: str q0, [sp, #2032] +; 
NONEON-NOSVE-NEXT: ldrh w11, [sp, #2034] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #2032] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #2036] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #2038] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #2040] +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #2042] +; NONEON-NOSVE-NEXT: add w10, w12, w11 +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #2044] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #2062] +; NONEON-NOSVE-NEXT: add w13, w13, w14 +; NONEON-NOSVE-NEXT: add w14, w15, w16 +; NONEON-NOSVE-NEXT: add w10, w10, w13 +; NONEON-NOSVE-NEXT: add w11, w14, w11 +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #2046] +; NONEON-NOSVE-NEXT: add w9, w10, w11 +; NONEON-NOSVE-NEXT: add w10, w8, w12 +; NONEON-NOSVE-NEXT: add w8, w9, w13 +; NONEON-NOSVE-NEXT: adrp x9, .LCPI3_0 +; NONEON-NOSVE-NEXT: bfi w8, w10, #16, #16 +; NONEON-NOSVE-NEXT: ldr q1, [x9, :lo12:.LCPI3_0] +; NONEON-NOSVE-NEXT: add x9, sp, #1744 +; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB3_2 +; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load +; NONEON-NOSVE-NEXT: ldrb w10, [x0] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #1999] +; NONEON-NOSVE-NEXT: sturh wzr, [x9, #253] +; NONEON-NOSVE-NEXT: stur wzr, [x9, #249] +; NONEON-NOSVE-NEXT: stur xzr, [x9, #241] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1984] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1984] +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB3_3 +; NONEON-NOSVE-NEXT: b .LBB3_4 +; NONEON-NOSVE-NEXT: .LBB3_2: +; NONEON-NOSVE-NEXT: mov v0.16b, v1.16b +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB3_4 +; NONEON-NOSVE-NEXT: .LBB3_3: // %cond.load1 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #1] +; NONEON-NOSVE-NEXT: str q0, [sp, #1936] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1968] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #1950] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1968] +; NONEON-NOSVE-NEXT: strh w10, [sp, #1966] +; NONEON-NOSVE-NEXT: str q0, [sp, #1920] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #202] +; NONEON-NOSVE-NEXT: ldur x11, [x9, #194] +; NONEON-NOSVE-NEXT: stur w10, [x9, #218] +; NONEON-NOSVE-NEXT: stur x11, [x9, #210] +; NONEON-NOSVE-NEXT: 
ldrb w10, [sp, #1936] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1952] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1920] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1953] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1952] +; NONEON-NOSVE-NEXT: .LBB3_4: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB3_36 +; NONEON-NOSVE-NEXT: // %bb.5: // %else5 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB3_37 +; NONEON-NOSVE-NEXT: .LBB3_6: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB3_38 +; NONEON-NOSVE-NEXT: .LBB3_7: // %else11 +; NONEON-NOSVE-NEXT: add x9, sp, #1488 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB3_39 +; NONEON-NOSVE-NEXT: .LBB3_8: // %else14 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB3_40 +; NONEON-NOSVE-NEXT: .LBB3_9: // %else17 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB3_41 +; NONEON-NOSVE-NEXT: .LBB3_10: // %else20 +; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB3_42 +; NONEON-NOSVE-NEXT: .LBB3_11: // %else23 +; NONEON-NOSVE-NEXT: add x9, sp, #1232 +; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB3_43 +; NONEON-NOSVE-NEXT: .LBB3_12: // %else26 +; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB3_44 +; NONEON-NOSVE-NEXT: .LBB3_13: // %else29 +; NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB3_45 +; NONEON-NOSVE-NEXT: .LBB3_14: // %else32 +; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB3_46 +; NONEON-NOSVE-NEXT: .LBB3_15: // %else35 +; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB3_47 +; NONEON-NOSVE-NEXT: .LBB3_16: // %else38 +; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB3_48 +; NONEON-NOSVE-NEXT: .LBB3_17: // %else41 +; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB3_49 +; NONEON-NOSVE-NEXT: .LBB3_18: // %else44 +; NONEON-NOSVE-NEXT: tbnz w8, #16, .LBB3_50 +; NONEON-NOSVE-NEXT: .LBB3_19: // %else47 +; NONEON-NOSVE-NEXT: add x9, sp, #720 +; NONEON-NOSVE-NEXT: tbnz w8, #17, .LBB3_51 +; NONEON-NOSVE-NEXT: .LBB3_20: // %else50 +; NONEON-NOSVE-NEXT: tbnz w8, #18, .LBB3_52 +; NONEON-NOSVE-NEXT: .LBB3_21: // %else53 +; NONEON-NOSVE-NEXT: tbnz w8, #19, .LBB3_53 +; NONEON-NOSVE-NEXT: .LBB3_22: // %else56 +; NONEON-NOSVE-NEXT: tbnz w8, #20, .LBB3_54 +; NONEON-NOSVE-NEXT: 
.LBB3_23: // %else59 +; NONEON-NOSVE-NEXT: add x9, sp, #464 +; NONEON-NOSVE-NEXT: tbnz w8, #21, .LBB3_55 +; NONEON-NOSVE-NEXT: .LBB3_24: // %else62 +; NONEON-NOSVE-NEXT: tbnz w8, #22, .LBB3_56 +; NONEON-NOSVE-NEXT: .LBB3_25: // %else65 +; NONEON-NOSVE-NEXT: tbnz w8, #23, .LBB3_57 +; NONEON-NOSVE-NEXT: .LBB3_26: // %else68 +; NONEON-NOSVE-NEXT: tbnz w8, #24, .LBB3_58 +; NONEON-NOSVE-NEXT: .LBB3_27: // %else71 +; NONEON-NOSVE-NEXT: add x9, sp, #208 +; NONEON-NOSVE-NEXT: tbnz w8, #25, .LBB3_59 +; NONEON-NOSVE-NEXT: .LBB3_28: // %else74 +; NONEON-NOSVE-NEXT: tbnz w8, #26, .LBB3_60 +; NONEON-NOSVE-NEXT: .LBB3_29: // %else77 +; NONEON-NOSVE-NEXT: tbnz w8, #27, .LBB3_61 +; NONEON-NOSVE-NEXT: .LBB3_30: // %else80 +; NONEON-NOSVE-NEXT: tbnz w8, #28, .LBB3_62 +; NONEON-NOSVE-NEXT: .LBB3_31: // %else83 +; NONEON-NOSVE-NEXT: tbnz w8, #29, .LBB3_63 +; NONEON-NOSVE-NEXT: .LBB3_32: // %else86 +; NONEON-NOSVE-NEXT: tbnz w8, #30, .LBB3_64 +; NONEON-NOSVE-NEXT: .LBB3_33: // %else89 +; NONEON-NOSVE-NEXT: tbz w8, #31, .LBB3_35 +; NONEON-NOSVE-NEXT: .LBB3_34: // %cond.load91 +; NONEON-NOSVE-NEXT: ldrb w8, [x0, #31] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: str q1, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #32] +; NONEON-NOSVE-NEXT: .LBB3_35: // %else92 +; NONEON-NOSVE-NEXT: add sp, sp, #2064 +; NONEON-NOSVE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB3_36: // %cond.load4 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #2] +; NONEON-NOSVE-NEXT: str q0, 
[sp, #1872] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1904] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1887] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1904] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1903] +; NONEON-NOSVE-NEXT: str q0, [sp, #1856] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #139] +; NONEON-NOSVE-NEXT: ldur x11, [x9, #131] +; NONEON-NOSVE-NEXT: stur w10, [x9, #155] +; NONEON-NOSVE-NEXT: stur x11, [x9, #147] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #1872] +; NONEON-NOSVE-NEXT: strh w10, [sp, #1888] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1856] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1890] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1888] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB3_6 +; NONEON-NOSVE-NEXT: .LBB3_37: // %cond.load7 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #3] +; NONEON-NOSVE-NEXT: str q0, [sp, #1808] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1840] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #1820] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1840] +; NONEON-NOSVE-NEXT: str w10, [sp, #1836] +; NONEON-NOSVE-NEXT: str q0, [sp, #1792] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #68] +; NONEON-NOSVE-NEXT: stur x10, [x9, #84] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1810] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1826] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #1808] +; NONEON-NOSVE-NEXT: strh w10, [sp, #1824] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1792] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1827] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1824] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB3_7 +; NONEON-NOSVE-NEXT: .LBB3_38: // %cond.load10 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #4] +; NONEON-NOSVE-NEXT: str q0, [sp, #1744] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1776] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1759] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1776] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1775] +; NONEON-NOSVE-NEXT: str q0, [sp, #1728] +; NONEON-NOSVE-NEXT: ldurh w10, [x9, #13] +; NONEON-NOSVE-NEXT: ldur x11, [x9, #5] +; NONEON-NOSVE-NEXT: sturh w10, [x9, #29] +; NONEON-NOSVE-NEXT: stur x11, [x9, #21] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #1744] +; 
NONEON-NOSVE-NEXT: str w9, [sp, #1760] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1728] +; NONEON-NOSVE-NEXT: strb w9, [sp, #1764] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1760] +; NONEON-NOSVE-NEXT: add x9, sp, #1488 +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB3_8 +; NONEON-NOSVE-NEXT: .LBB3_39: // %cond.load13 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #5] +; NONEON-NOSVE-NEXT: str q0, [sp, #1680] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1712] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #1694] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1712] +; NONEON-NOSVE-NEXT: strh w10, [sp, #1710] +; NONEON-NOSVE-NEXT: str q0, [sp, #1664] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #198] +; NONEON-NOSVE-NEXT: stur x10, [x9, #214] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1684] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1700] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #1680] +; NONEON-NOSVE-NEXT: str w10, [sp, #1696] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1664] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1701] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1696] +; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB3_9 +; NONEON-NOSVE-NEXT: .LBB3_40: // %cond.load16 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #6] +; NONEON-NOSVE-NEXT: str q0, [sp, #1616] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1648] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1631] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1648] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1647] +; NONEON-NOSVE-NEXT: str q0, [sp, #1600] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #135] +; NONEON-NOSVE-NEXT: stur x10, [x9, #151] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #1620] +; NONEON-NOSVE-NEXT: strh w10, [sp, #1636] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #1616] +; NONEON-NOSVE-NEXT: str w10, [sp, #1632] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1600] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1638] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1632] +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB3_10 +; NONEON-NOSVE-NEXT: .LBB3_41: // %cond.load19 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #7] +; NONEON-NOSVE-NEXT: str q0, [sp, #1552] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1584] +; 
NONEON-NOSVE-NEXT: ldr x10, [sp, #1560] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1584] +; NONEON-NOSVE-NEXT: str x10, [sp, #1576] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1558] +; NONEON-NOSVE-NEXT: str q0, [sp, #1536] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1574] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #1556] +; NONEON-NOSVE-NEXT: strh w10, [sp, #1572] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #1552] +; NONEON-NOSVE-NEXT: str w10, [sp, #1568] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1536] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1575] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1568] +; NONEON-NOSVE-NEXT: tbz w8, #8, .LBB3_11 +; NONEON-NOSVE-NEXT: .LBB3_42: // %cond.load22 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #8] +; NONEON-NOSVE-NEXT: str q0, [sp, #1488] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1520] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1503] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1520] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1519] +; NONEON-NOSVE-NEXT: str q0, [sp, #1472] +; NONEON-NOSVE-NEXT: ldurh w10, [x9, #13] +; NONEON-NOSVE-NEXT: ldur w11, [x9, #9] +; NONEON-NOSVE-NEXT: sturh w10, [x9, #29] +; NONEON-NOSVE-NEXT: stur w11, [x9, #25] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #1488] +; NONEON-NOSVE-NEXT: str x9, [sp, #1504] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1472] +; NONEON-NOSVE-NEXT: strb w9, [sp, #1512] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1504] +; NONEON-NOSVE-NEXT: add x9, sp, #1232 +; NONEON-NOSVE-NEXT: tbz w8, #9, .LBB3_12 +; NONEON-NOSVE-NEXT: .LBB3_43: // %cond.load25 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #9] +; NONEON-NOSVE-NEXT: str q0, [sp, #1424] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1456] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #1438] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1456] +; NONEON-NOSVE-NEXT: strh w10, [sp, #1454] +; NONEON-NOSVE-NEXT: str q0, [sp, #1408] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #202] +; NONEON-NOSVE-NEXT: stur w10, [x9, #218] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1432] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1448] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #1424] +; NONEON-NOSVE-NEXT: 
str x10, [sp, #1440] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1408] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1449] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1440] +; NONEON-NOSVE-NEXT: tbz w8, #10, .LBB3_13 +; NONEON-NOSVE-NEXT: .LBB3_44: // %cond.load28 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #10] +; NONEON-NOSVE-NEXT: str q0, [sp, #1360] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1392] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1375] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1392] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1391] +; NONEON-NOSVE-NEXT: str q0, [sp, #1344] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #139] +; NONEON-NOSVE-NEXT: stur w10, [x9, #155] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #1368] +; NONEON-NOSVE-NEXT: strh w10, [sp, #1384] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #1360] +; NONEON-NOSVE-NEXT: str x10, [sp, #1376] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1344] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1386] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1376] +; NONEON-NOSVE-NEXT: tbz w8, #11, .LBB3_14 +; NONEON-NOSVE-NEXT: .LBB3_45: // %cond.load31 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #11] +; NONEON-NOSVE-NEXT: str q0, [sp, #1296] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1328] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #1308] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1328] +; NONEON-NOSVE-NEXT: str w10, [sp, #1324] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1306] +; NONEON-NOSVE-NEXT: str q0, [sp, #1280] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1322] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #1304] +; NONEON-NOSVE-NEXT: strh w10, [sp, #1320] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #1296] +; NONEON-NOSVE-NEXT: str x10, [sp, #1312] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1280] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1323] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1312] +; NONEON-NOSVE-NEXT: tbz w8, #12, .LBB3_15 +; NONEON-NOSVE-NEXT: .LBB3_46: // %cond.load34 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #12] +; NONEON-NOSVE-NEXT: str q0, [sp, #1232] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1264] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1247] +; 
NONEON-NOSVE-NEXT: ldr q0, [sp, #1264] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1263] +; NONEON-NOSVE-NEXT: str q0, [sp, #1216] +; NONEON-NOSVE-NEXT: ldurh w10, [x9, #13] +; NONEON-NOSVE-NEXT: sturh w10, [x9, #29] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #1240] +; NONEON-NOSVE-NEXT: str w9, [sp, #1256] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #1232] +; NONEON-NOSVE-NEXT: str x9, [sp, #1248] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1216] +; NONEON-NOSVE-NEXT: strb w9, [sp, #1260] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1248] +; NONEON-NOSVE-NEXT: tbz w8, #13, .LBB3_16 +; NONEON-NOSVE-NEXT: .LBB3_47: // %cond.load37 +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #13] +; NONEON-NOSVE-NEXT: str q0, [sp, #1168] +; NONEON-NOSVE-NEXT: strb w9, [sp, #1200] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #1182] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1200] +; NONEON-NOSVE-NEXT: strh w9, [sp, #1198] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1180] +; NONEON-NOSVE-NEXT: str q0, [sp, #1152] +; NONEON-NOSVE-NEXT: strb w9, [sp, #1196] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #1176] +; NONEON-NOSVE-NEXT: str w9, [sp, #1192] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #1168] +; NONEON-NOSVE-NEXT: str x9, [sp, #1184] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1152] +; NONEON-NOSVE-NEXT: strb w9, [sp, #1197] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1184] +; NONEON-NOSVE-NEXT: tbz w8, #14, .LBB3_17 +; NONEON-NOSVE-NEXT: .LBB3_48: // %cond.load40 +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #14] +; NONEON-NOSVE-NEXT: str q0, [sp, #1104] +; NONEON-NOSVE-NEXT: strb w9, [sp, #1136] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1119] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1136] +; NONEON-NOSVE-NEXT: strb w9, [sp, #1135] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #1116] +; NONEON-NOSVE-NEXT: str q0, [sp, #1088] +; NONEON-NOSVE-NEXT: strh w9, [sp, #1132] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #1112] +; NONEON-NOSVE-NEXT: str w9, [sp, #1128] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #1104] +; NONEON-NOSVE-NEXT: str x9, [sp, #1120] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1088] +; NONEON-NOSVE-NEXT: strb w9, [sp, 
#1134] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1120] +; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB3_18 +; NONEON-NOSVE-NEXT: .LBB3_49: // %cond.load43 +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #15] +; NONEON-NOSVE-NEXT: str q0, [sp, #1024] +; NONEON-NOSVE-NEXT: strb w9, [sp, #1072] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1038] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1072] +; NONEON-NOSVE-NEXT: strb w9, [sp, #1070] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #1036] +; NONEON-NOSVE-NEXT: str q0, [sp, #1040] +; NONEON-NOSVE-NEXT: strh w9, [sp, #1068] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #1032] +; NONEON-NOSVE-NEXT: str w9, [sp, #1064] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #1024] +; NONEON-NOSVE-NEXT: str x9, [sp, #1056] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1040] +; NONEON-NOSVE-NEXT: strb w9, [sp, #1071] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1056] +; NONEON-NOSVE-NEXT: tbz w8, #16, .LBB3_19 +; NONEON-NOSVE-NEXT: .LBB3_50: // %cond.load46 +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [sp, #976] +; NONEON-NOSVE-NEXT: add x10, sp, #976 +; NONEON-NOSVE-NEXT: strb w9, [sp, #1008] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #991] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #1008] +; NONEON-NOSVE-NEXT: strb w9, [sp, #1007] +; NONEON-NOSVE-NEXT: str q1, [sp, #960] +; NONEON-NOSVE-NEXT: ldurh w9, [x10, #13] +; NONEON-NOSVE-NEXT: ldur w11, [x10, #9] +; NONEON-NOSVE-NEXT: sturh w9, [x10, #29] +; NONEON-NOSVE-NEXT: ldur x9, [x10, #1] +; NONEON-NOSVE-NEXT: stur w11, [x10, #25] +; NONEON-NOSVE-NEXT: stur x9, [x10, #17] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #960] +; NONEON-NOSVE-NEXT: strb w9, [sp, #992] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #992] +; NONEON-NOSVE-NEXT: add x9, sp, #720 +; NONEON-NOSVE-NEXT: tbz w8, #17, .LBB3_20 +; NONEON-NOSVE-NEXT: .LBB3_51: // %cond.load49 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #17] +; NONEON-NOSVE-NEXT: str q1, [sp, #912] +; NONEON-NOSVE-NEXT: strb w10, [sp, #944] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #926] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #944] +; NONEON-NOSVE-NEXT: strh w10, 
[sp, #942] +; NONEON-NOSVE-NEXT: str q1, [sp, #896] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #202] +; NONEON-NOSVE-NEXT: ldur x11, [x9, #194] +; NONEON-NOSVE-NEXT: stur w10, [x9, #218] +; NONEON-NOSVE-NEXT: stur x11, [x9, #210] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #912] +; NONEON-NOSVE-NEXT: strb w10, [sp, #928] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #896] +; NONEON-NOSVE-NEXT: strb w10, [sp, #929] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #928] +; NONEON-NOSVE-NEXT: tbz w8, #18, .LBB3_21 +; NONEON-NOSVE-NEXT: .LBB3_52: // %cond.load52 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #18] +; NONEON-NOSVE-NEXT: str q1, [sp, #848] +; NONEON-NOSVE-NEXT: strb w10, [sp, #880] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #863] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #880] +; NONEON-NOSVE-NEXT: strb w10, [sp, #879] +; NONEON-NOSVE-NEXT: str q1, [sp, #832] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #139] +; NONEON-NOSVE-NEXT: ldur x11, [x9, #131] +; NONEON-NOSVE-NEXT: stur w10, [x9, #155] +; NONEON-NOSVE-NEXT: stur x11, [x9, #147] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #848] +; NONEON-NOSVE-NEXT: strh w10, [sp, #864] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #832] +; NONEON-NOSVE-NEXT: strb w10, [sp, #866] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #864] +; NONEON-NOSVE-NEXT: tbz w8, #19, .LBB3_22 +; NONEON-NOSVE-NEXT: .LBB3_53: // %cond.load55 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #19] +; NONEON-NOSVE-NEXT: str q1, [sp, #784] +; NONEON-NOSVE-NEXT: strb w10, [sp, #816] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #796] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #816] +; NONEON-NOSVE-NEXT: str w10, [sp, #812] +; NONEON-NOSVE-NEXT: str q1, [sp, #768] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #68] +; NONEON-NOSVE-NEXT: stur x10, [x9, #84] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #786] +; NONEON-NOSVE-NEXT: strb w10, [sp, #802] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #784] +; NONEON-NOSVE-NEXT: strh w10, [sp, #800] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #768] +; NONEON-NOSVE-NEXT: strb w10, [sp, #803] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #800] +; NONEON-NOSVE-NEXT: 
tbz w8, #20, .LBB3_23 +; NONEON-NOSVE-NEXT: .LBB3_54: // %cond.load58 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #20] +; NONEON-NOSVE-NEXT: str q1, [sp, #720] +; NONEON-NOSVE-NEXT: strb w10, [sp, #752] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #735] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #752] +; NONEON-NOSVE-NEXT: strb w10, [sp, #751] +; NONEON-NOSVE-NEXT: str q1, [sp, #704] +; NONEON-NOSVE-NEXT: ldurh w10, [x9, #13] +; NONEON-NOSVE-NEXT: ldur x11, [x9, #5] +; NONEON-NOSVE-NEXT: sturh w10, [x9, #29] +; NONEON-NOSVE-NEXT: stur x11, [x9, #21] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #720] +; NONEON-NOSVE-NEXT: str w9, [sp, #736] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #704] +; NONEON-NOSVE-NEXT: strb w9, [sp, #740] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #736] +; NONEON-NOSVE-NEXT: add x9, sp, #464 +; NONEON-NOSVE-NEXT: tbz w8, #21, .LBB3_24 +; NONEON-NOSVE-NEXT: .LBB3_55: // %cond.load61 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #21] +; NONEON-NOSVE-NEXT: str q1, [sp, #656] +; NONEON-NOSVE-NEXT: strb w10, [sp, #688] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #670] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #688] +; NONEON-NOSVE-NEXT: strh w10, [sp, #686] +; NONEON-NOSVE-NEXT: str q1, [sp, #640] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #198] +; NONEON-NOSVE-NEXT: stur x10, [x9, #214] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #660] +; NONEON-NOSVE-NEXT: strb w10, [sp, #676] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #656] +; NONEON-NOSVE-NEXT: str w10, [sp, #672] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #640] +; NONEON-NOSVE-NEXT: strb w10, [sp, #677] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #672] +; NONEON-NOSVE-NEXT: tbz w8, #22, .LBB3_25 +; NONEON-NOSVE-NEXT: .LBB3_56: // %cond.load64 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #22] +; NONEON-NOSVE-NEXT: str q1, [sp, #592] +; NONEON-NOSVE-NEXT: strb w10, [sp, #624] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #607] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #624] +; NONEON-NOSVE-NEXT: strb w10, [sp, #623] +; NONEON-NOSVE-NEXT: str q1, [sp, #576] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #135] +; NONEON-NOSVE-NEXT: 
stur x10, [x9, #151] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #596] +; NONEON-NOSVE-NEXT: strh w10, [sp, #612] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #592] +; NONEON-NOSVE-NEXT: str w10, [sp, #608] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #576] +; NONEON-NOSVE-NEXT: strb w10, [sp, #614] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #608] +; NONEON-NOSVE-NEXT: tbz w8, #23, .LBB3_26 +; NONEON-NOSVE-NEXT: .LBB3_57: // %cond.load67 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #23] +; NONEON-NOSVE-NEXT: str q1, [sp, #528] +; NONEON-NOSVE-NEXT: strb w10, [sp, #560] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #536] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #560] +; NONEON-NOSVE-NEXT: str x10, [sp, #552] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #534] +; NONEON-NOSVE-NEXT: str q1, [sp, #512] +; NONEON-NOSVE-NEXT: strb w10, [sp, #550] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #532] +; NONEON-NOSVE-NEXT: strh w10, [sp, #548] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #528] +; NONEON-NOSVE-NEXT: str w10, [sp, #544] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #512] +; NONEON-NOSVE-NEXT: strb w10, [sp, #551] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #544] +; NONEON-NOSVE-NEXT: tbz w8, #24, .LBB3_27 +; NONEON-NOSVE-NEXT: .LBB3_58: // %cond.load70 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #24] +; NONEON-NOSVE-NEXT: str q1, [sp, #464] +; NONEON-NOSVE-NEXT: strb w10, [sp, #496] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #479] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #496] +; NONEON-NOSVE-NEXT: strb w10, [sp, #495] +; NONEON-NOSVE-NEXT: str q1, [sp, #448] +; NONEON-NOSVE-NEXT: ldurh w10, [x9, #13] +; NONEON-NOSVE-NEXT: ldur w11, [x9, #9] +; NONEON-NOSVE-NEXT: sturh w10, [x9, #29] +; NONEON-NOSVE-NEXT: stur w11, [x9, #25] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #464] +; NONEON-NOSVE-NEXT: str x9, [sp, #480] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #448] +; NONEON-NOSVE-NEXT: strb w9, [sp, #488] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #480] +; NONEON-NOSVE-NEXT: add x9, sp, #208 +; NONEON-NOSVE-NEXT: tbz w8, #25, .LBB3_28 +; NONEON-NOSVE-NEXT: .LBB3_59: // %cond.load73 +; NONEON-NOSVE-NEXT: 
ldrb w10, [x0, #25] +; NONEON-NOSVE-NEXT: str q1, [sp, #400] +; NONEON-NOSVE-NEXT: strb w10, [sp, #432] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #414] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #432] +; NONEON-NOSVE-NEXT: strh w10, [sp, #430] +; NONEON-NOSVE-NEXT: str q1, [sp, #384] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #202] +; NONEON-NOSVE-NEXT: stur w10, [x9, #218] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #408] +; NONEON-NOSVE-NEXT: strb w10, [sp, #424] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #400] +; NONEON-NOSVE-NEXT: str x10, [sp, #416] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #384] +; NONEON-NOSVE-NEXT: strb w10, [sp, #425] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #416] +; NONEON-NOSVE-NEXT: tbz w8, #26, .LBB3_29 +; NONEON-NOSVE-NEXT: .LBB3_60: // %cond.load76 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #26] +; NONEON-NOSVE-NEXT: str q1, [sp, #336] +; NONEON-NOSVE-NEXT: strb w10, [sp, #368] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #351] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #368] +; NONEON-NOSVE-NEXT: strb w10, [sp, #367] +; NONEON-NOSVE-NEXT: str q1, [sp, #320] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #139] +; NONEON-NOSVE-NEXT: stur w10, [x9, #155] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #344] +; NONEON-NOSVE-NEXT: strh w10, [sp, #360] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #336] +; NONEON-NOSVE-NEXT: str x10, [sp, #352] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #320] +; NONEON-NOSVE-NEXT: strb w10, [sp, #362] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #352] +; NONEON-NOSVE-NEXT: tbz w8, #27, .LBB3_30 +; NONEON-NOSVE-NEXT: .LBB3_61: // %cond.load79 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #27] +; NONEON-NOSVE-NEXT: str q1, [sp, #272] +; NONEON-NOSVE-NEXT: strb w10, [sp, #304] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #284] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #304] +; NONEON-NOSVE-NEXT: str w10, [sp, #300] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #282] +; NONEON-NOSVE-NEXT: str q1, [sp, #256] +; NONEON-NOSVE-NEXT: strb w10, [sp, #298] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #280] +; NONEON-NOSVE-NEXT: strh w10, [sp, #296] +; 
NONEON-NOSVE-NEXT: ldr x10, [sp, #272] +; NONEON-NOSVE-NEXT: str x10, [sp, #288] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #256] +; NONEON-NOSVE-NEXT: strb w10, [sp, #299] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #288] +; NONEON-NOSVE-NEXT: tbz w8, #28, .LBB3_31 +; NONEON-NOSVE-NEXT: .LBB3_62: // %cond.load82 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #28] +; NONEON-NOSVE-NEXT: str q1, [sp, #208] +; NONEON-NOSVE-NEXT: strb w10, [sp, #240] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #223] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #240] +; NONEON-NOSVE-NEXT: strb w10, [sp, #239] +; NONEON-NOSVE-NEXT: str q1, [sp, #192] +; NONEON-NOSVE-NEXT: ldurh w10, [x9, #13] +; NONEON-NOSVE-NEXT: sturh w10, [x9, #29] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #216] +; NONEON-NOSVE-NEXT: str w9, [sp, #232] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #208] +; NONEON-NOSVE-NEXT: str x9, [sp, #224] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #192] +; NONEON-NOSVE-NEXT: strb w9, [sp, #236] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #224] +; NONEON-NOSVE-NEXT: tbz w8, #29, .LBB3_32 +; NONEON-NOSVE-NEXT: .LBB3_63: // %cond.load85 +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #29] +; NONEON-NOSVE-NEXT: str q1, [sp, #144] +; NONEON-NOSVE-NEXT: strb w9, [sp, #176] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #158] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #176] +; NONEON-NOSVE-NEXT: strh w9, [sp, #174] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #156] +; NONEON-NOSVE-NEXT: str q1, [sp, #128] +; NONEON-NOSVE-NEXT: strb w9, [sp, #172] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #152] +; NONEON-NOSVE-NEXT: str w9, [sp, #168] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #144] +; NONEON-NOSVE-NEXT: str x9, [sp, #160] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #128] +; NONEON-NOSVE-NEXT: strb w9, [sp, #173] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #160] +; NONEON-NOSVE-NEXT: tbz w8, #30, .LBB3_33 +; NONEON-NOSVE-NEXT: .LBB3_64: // %cond.load88 +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #30] +; NONEON-NOSVE-NEXT: str q1, [sp, #80] +; NONEON-NOSVE-NEXT: strb w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #95] +; 
NONEON-NOSVE-NEXT: ldr q1, [sp, #112] +; NONEON-NOSVE-NEXT: strb w9, [sp, #111] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #92] +; NONEON-NOSVE-NEXT: str q1, [sp, #64] +; NONEON-NOSVE-NEXT: strh w9, [sp, #108] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #88] +; NONEON-NOSVE-NEXT: str w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #80] +; NONEON-NOSVE-NEXT: str x9, [sp, #96] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #64] +; NONEON-NOSVE-NEXT: strb w9, [sp, #110] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #96] +; NONEON-NOSVE-NEXT: tbnz w8, #31, .LBB3_34 +; NONEON-NOSVE-NEXT: b .LBB3_35 %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %src, i32 8, <32 x i1> %mask, <32 x i8> zeroinitializer) ret <32 x i8> %load } @@ -638,27 +1642,36 @@ define <2 x half> @masked_load_v2f16(ptr %src, <2 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_load_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #31 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI4_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI4_0] -; NONEON-NOSVE-NEXT: cmlt v0.2s, v0.2s, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addp v1.2s, v0.2s, v0.2s -; NONEON-NOSVE-NEXT: movi d0, #0000000000000000 -; NONEON-NOSVE-NEXT: fmov w8, s1 -; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB4_3 -; NONEON-NOSVE-NEXT: // %bb.1: // %else -; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB4_4 -; NONEON-NOSVE-NEXT: .LBB4_2: // %else2 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 -; NONEON-NOSVE-NEXT: ret -; NONEON-NOSVE-NEXT: .LBB4_3: // %cond.load -; NONEON-NOSVE-NEXT: ldr h0, [x0] -; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB4_2 -; NONEON-NOSVE-NEXT: .LBB4_4: // %cond.load1 -; NONEON-NOSVE-NEXT: add x8, x0, #2 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[1], [x8] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #48 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: str d0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: 
sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: bfxil w8, w9, #0, #1 +; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB4_2 +; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: ldr h1, [x0] +; NONEON-NOSVE-NEXT: str h1, [sp, #24] +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB4_3 +; NONEON-NOSVE-NEXT: b .LBB4_4 +; NONEON-NOSVE-NEXT: .LBB4_2: +; NONEON-NOSVE-NEXT: adrp x9, .LCPI4_0 +; NONEON-NOSVE-NEXT: ldr d0, [x9, :lo12:.LCPI4_0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB4_4 +; NONEON-NOSVE-NEXT: .LBB4_3: // %cond.load1 +; NONEON-NOSVE-NEXT: ldr h1, [x0, #2] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: str h1, [sp, #18] +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: .LBB4_4: // %else2 +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %load = call <2 x half> @llvm.masked.load.v2f16(ptr %src, i32 8, <2 x i1> %mask, <2 x half> zeroinitializer) ret <2 x half> %load @@ -678,39 +1691,84 @@ define <4 x half> @masked_load_v4f16(ptr %src, <4 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_load_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI5_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI5_0] -; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addv h1, v0.4h -; NONEON-NOSVE-NEXT: movi d0, #0000000000000000 -; NONEON-NOSVE-NEXT: fmov w8, s1 -; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB5_5 -; NONEON-NOSVE-NEXT: // %bb.1: // %else -; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB5_6 -; NONEON-NOSVE-NEXT: .LBB5_2: // %else2 +; NONEON-NOSVE-NEXT: sub sp, sp, #128 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: str d0, [sp, #112] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #114] +; NONEON-NOSVE-NEXT: 
ldrh w9, [sp, #116] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #118] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #112] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: and w9, w9, #0x4 +; NONEON-NOSVE-NEXT: and w10, w10, #0x8 +; NONEON-NOSVE-NEXT: bfxil w8, w11, #0, #1 +; NONEON-NOSVE-NEXT: orr w9, w9, w10 +; NONEON-NOSVE-NEXT: orr w8, w8, w9 +; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB5_2 +; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: ldr h1, [x0] +; NONEON-NOSVE-NEXT: stur wzr, [sp, #106] +; NONEON-NOSVE-NEXT: str h1, [sp, #104] +; NONEON-NOSVE-NEXT: str h0, [sp, #110] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #104] +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB5_3 +; NONEON-NOSVE-NEXT: b .LBB5_4 +; NONEON-NOSVE-NEXT: .LBB5_2: +; NONEON-NOSVE-NEXT: adrp x9, .LCPI5_0 +; NONEON-NOSVE-NEXT: ldr d0, [x9, :lo12:.LCPI5_0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB5_4 +; NONEON-NOSVE-NEXT: .LBB5_3: // %cond.load1 +; NONEON-NOSVE-NEXT: ldr h1, [x0, #2] +; NONEON-NOSVE-NEXT: str d0, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #84] +; NONEON-NOSVE-NEXT: str h1, [sp, #96] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #96] +; NONEON-NOSVE-NEXT: str w9, [sp, #92] +; NONEON-NOSVE-NEXT: str d0, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #80] +; NONEON-NOSVE-NEXT: str h0, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #72] +; NONEON-NOSVE-NEXT: str h0, [sp, #90] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #88] +; NONEON-NOSVE-NEXT: .LBB5_4: // %else2 ; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB5_7 -; NONEON-NOSVE-NEXT: .LBB5_3: // %else5 +; NONEON-NOSVE-NEXT: // %bb.5: // %else5 ; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB5_8 -; NONEON-NOSVE-NEXT: .LBB5_4: // %else8 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: .LBB5_6: // %else8 +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret -; NONEON-NOSVE-NEXT: 
.LBB5_5: // %cond.load -; NONEON-NOSVE-NEXT: ldr h0, [x0] -; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB5_2 -; NONEON-NOSVE-NEXT: .LBB5_6: // %cond.load1 -; NONEON-NOSVE-NEXT: add x9, x0, #2 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[1], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB5_3 ; NONEON-NOSVE-NEXT: .LBB5_7: // %cond.load4 -; NONEON-NOSVE-NEXT: add x9, x0, #4 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB5_4 +; NONEON-NOSVE-NEXT: ldr h1, [x0, #4] +; NONEON-NOSVE-NEXT: str d0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #48] +; NONEON-NOSVE-NEXT: str h1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #64] +; NONEON-NOSVE-NEXT: str w9, [sp, #56] +; NONEON-NOSVE-NEXT: str d0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #40] +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB5_6 ; NONEON-NOSVE-NEXT: .LBB5_8: // %cond.load7 -; NONEON-NOSVE-NEXT: add x8, x0, #6 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[3], [x8] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ldr h1, [x0, #6] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: str h1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #32] +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: str d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %load = call <4 x half> @llvm.masked.load.v4f16(ptr %src, i32 8, <4 x i1> %mask, <4 x half> zeroinitializer) ret <4 x half> %load @@ -731,62 +1789,184 @@ define <8 x half> @masked_load_v8f16(ptr %src, <8 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_load_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl 
v0.8b, v0.8b, #7 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI6_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI6_0] -; NONEON-NOSVE-NEXT: cmlt v0.8b, v0.8b, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addv b1, v0.8b -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 -; NONEON-NOSVE-NEXT: fmov w8, s1 -; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB6_9 -; NONEON-NOSVE-NEXT: // %bb.1: // %else -; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB6_10 -; NONEON-NOSVE-NEXT: .LBB6_2: // %else2 -; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB6_11 -; NONEON-NOSVE-NEXT: .LBB6_3: // %else5 -; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB6_12 -; NONEON-NOSVE-NEXT: .LBB6_4: // %else8 -; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB6_13 -; NONEON-NOSVE-NEXT: .LBB6_5: // %else11 -; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB6_14 -; NONEON-NOSVE-NEXT: .LBB6_6: // %else14 -; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB6_15 -; NONEON-NOSVE-NEXT: .LBB6_7: // %else17 -; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB6_16 -; NONEON-NOSVE-NEXT: .LBB6_8: // %else20 -; NONEON-NOSVE-NEXT: ret -; NONEON-NOSVE-NEXT: .LBB6_9: // %cond.load -; NONEON-NOSVE-NEXT: ldr h0, [x0] -; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB6_2 -; NONEON-NOSVE-NEXT: .LBB6_10: // %cond.load1 -; NONEON-NOSVE-NEXT: add x9, x0, #2 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[1], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB6_3 -; NONEON-NOSVE-NEXT: .LBB6_11: // %cond.load4 -; NONEON-NOSVE-NEXT: add x9, x0, #4 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB6_4 -; NONEON-NOSVE-NEXT: .LBB6_12: // %cond.load7 -; NONEON-NOSVE-NEXT: add x9, x0, #6 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[3], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB6_5 -; NONEON-NOSVE-NEXT: .LBB6_13: // %cond.load10 -; NONEON-NOSVE-NEXT: add x9, x0, #8 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[4], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB6_6 -; NONEON-NOSVE-NEXT: .LBB6_14: // %cond.load13 -; NONEON-NOSVE-NEXT: add x9, x0, #10 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[5], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #6, 
.LBB6_7 -; NONEON-NOSVE-NEXT: .LBB6_15: // %cond.load16 -; NONEON-NOSVE-NEXT: add x9, x0, #12 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[6], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB6_8 -; NONEON-NOSVE-NEXT: .LBB6_16: // %cond.load19 -; NONEON-NOSVE-NEXT: add x8, x0, #14 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[7], [x8] +; NONEON-NOSVE-NEXT: sub sp, sp, #496 +; NONEON-NOSVE-NEXT: str x29, [sp, #480] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 496 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: str d0, [sp, #464] +; NONEON-NOSVE-NEXT: add x9, sp, #208 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #466] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #467] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #465] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #468] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #469] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #470] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #464] +; NONEON-NOSVE-NEXT: sbfx w13, w13, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w14, w14, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x4 +; NONEON-NOSVE-NEXT: and w10, w10, #0x8 +; NONEON-NOSVE-NEXT: sbfx w15, w15, #0, #1 +; NONEON-NOSVE-NEXT: orr w8, w8, w10 +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #471] +; NONEON-NOSVE-NEXT: and w11, w11, #0x2 +; NONEON-NOSVE-NEXT: and w13, w13, #0x10 +; NONEON-NOSVE-NEXT: bfxil w11, w12, #0, #1 +; NONEON-NOSVE-NEXT: and w12, w14, #0x20 +; NONEON-NOSVE-NEXT: orr w8, w8, w13 +; NONEON-NOSVE-NEXT: and w13, w15, #0x40 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: orr w8, w11, w8 +; NONEON-NOSVE-NEXT: orr w11, w12, w13 +; NONEON-NOSVE-NEXT: orr w8, w8, w11 +; NONEON-NOSVE-NEXT: and w10, w10, #0x80 +; NONEON-NOSVE-NEXT: add w10, w8, w10 +; NONEON-NOSVE-NEXT: and w8, w10, #0xff +; NONEON-NOSVE-NEXT: tbz w10, #0, .LBB6_2 +; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: ldr h1, [x0] +; 
NONEON-NOSVE-NEXT: stur wzr, [x9, #250] +; NONEON-NOSVE-NEXT: stur xzr, [x9, #242] +; NONEON-NOSVE-NEXT: str h1, [sp, #448] +; NONEON-NOSVE-NEXT: str h0, [sp, #462] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #448] +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB6_3 +; NONEON-NOSVE-NEXT: b .LBB6_4 +; NONEON-NOSVE-NEXT: .LBB6_2: +; NONEON-NOSVE-NEXT: adrp x10, .LCPI6_0 +; NONEON-NOSVE-NEXT: ldr q0, [x10, :lo12:.LCPI6_0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB6_4 +; NONEON-NOSVE-NEXT: .LBB6_3: // %cond.load1 +; NONEON-NOSVE-NEXT: ldr h1, [x0, #2] +; NONEON-NOSVE-NEXT: str q0, [sp, #400] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #412] +; NONEON-NOSVE-NEXT: str h1, [sp, #432] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #432] +; NONEON-NOSVE-NEXT: str w10, [sp, #428] +; NONEON-NOSVE-NEXT: str q0, [sp, #384] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #196] +; NONEON-NOSVE-NEXT: stur x10, [x9, #212] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #400] +; NONEON-NOSVE-NEXT: str h0, [sp, #416] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #384] +; NONEON-NOSVE-NEXT: str h0, [sp, #418] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #416] +; NONEON-NOSVE-NEXT: .LBB6_4: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB6_12 +; NONEON-NOSVE-NEXT: // %bb.5: // %else5 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB6_13 +; NONEON-NOSVE-NEXT: .LBB6_6: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB6_14 +; NONEON-NOSVE-NEXT: .LBB6_7: // %else11 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB6_15 +; NONEON-NOSVE-NEXT: .LBB6_8: // %else14 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB6_16 +; NONEON-NOSVE-NEXT: .LBB6_9: // %else17 +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB6_11 +; NONEON-NOSVE-NEXT: .LBB6_10: // %cond.load19 +; NONEON-NOSVE-NEXT: ldr h1, [x0, #14] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: str h1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; 
NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: .LBB6_11: // %else20 +; NONEON-NOSVE-NEXT: ldr x29, [sp, #480] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: add sp, sp, #496 ; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB6_12: // %cond.load4 +; NONEON-NOSVE-NEXT: ldr h1, [x0, #4] +; NONEON-NOSVE-NEXT: str q0, [sp, #336] +; NONEON-NOSVE-NEXT: str h1, [sp, #368] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #368] +; NONEON-NOSVE-NEXT: str q0, [sp, #320] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #350] +; NONEON-NOSVE-NEXT: str h0, [sp, #366] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #134] +; NONEON-NOSVE-NEXT: stur x10, [x9, #150] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #336] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #320] +; NONEON-NOSVE-NEXT: str w10, [sp, #352] +; NONEON-NOSVE-NEXT: str h0, [sp, #356] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #352] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB6_6 +; NONEON-NOSVE-NEXT: .LBB6_13: // %cond.load7 +; NONEON-NOSVE-NEXT: ldr h1, [x0, #6] +; NONEON-NOSVE-NEXT: str q0, [sp, #272] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #280] +; NONEON-NOSVE-NEXT: str h1, [sp, #304] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #304] +; NONEON-NOSVE-NEXT: str x10, [sp, #296] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #272] +; NONEON-NOSVE-NEXT: str q0, [sp, #256] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #276] +; NONEON-NOSVE-NEXT: str w10, [sp, #288] +; NONEON-NOSVE-NEXT: str h0, [sp, #292] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #256] +; NONEON-NOSVE-NEXT: str h0, [sp, #294] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #288] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB6_7 +; NONEON-NOSVE-NEXT: .LBB6_14: // %cond.load10 +; NONEON-NOSVE-NEXT: ldr h1, [x0, #8] +; NONEON-NOSVE-NEXT: str q0, [sp, #208] +; NONEON-NOSVE-NEXT: str h1, [sp, #240] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #240] +; NONEON-NOSVE-NEXT: str q0, [sp, #192] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #222] +; 
NONEON-NOSVE-NEXT: str h0, [sp, #238] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #10] +; NONEON-NOSVE-NEXT: stur w10, [x9, #26] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #208] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #192] +; NONEON-NOSVE-NEXT: str x9, [sp, #224] +; NONEON-NOSVE-NEXT: str h0, [sp, #232] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #224] +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB6_8 +; NONEON-NOSVE-NEXT: .LBB6_15: // %cond.load13 +; NONEON-NOSVE-NEXT: ldr h1, [x0, #10] +; NONEON-NOSVE-NEXT: str q0, [sp, #144] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #156] +; NONEON-NOSVE-NEXT: str h1, [sp, #176] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #176] +; NONEON-NOSVE-NEXT: str w9, [sp, #172] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #144] +; NONEON-NOSVE-NEXT: str q0, [sp, #128] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #152] +; NONEON-NOSVE-NEXT: str x9, [sp, #160] +; NONEON-NOSVE-NEXT: str h0, [sp, #168] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #128] +; NONEON-NOSVE-NEXT: str h0, [sp, #170] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #160] +; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB6_9 +; NONEON-NOSVE-NEXT: .LBB6_16: // %cond.load16 +; NONEON-NOSVE-NEXT: ldr h1, [x0, #12] +; NONEON-NOSVE-NEXT: str q0, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #88] +; NONEON-NOSVE-NEXT: str h1, [sp, #112] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #112] +; NONEON-NOSVE-NEXT: str w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #80] +; NONEON-NOSVE-NEXT: str q0, [sp, #64] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #94] +; NONEON-NOSVE-NEXT: str x9, [sp, #96] +; NONEON-NOSVE-NEXT: str h0, [sp, #110] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #64] +; NONEON-NOSVE-NEXT: str h0, [sp, #108] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #96] +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB6_10 +; NONEON-NOSVE-NEXT: b .LBB6_11 %load = call <8 x half> @llvm.masked.load.v8f16(ptr %src, i32 8, <8 x i1> %mask, <8 x half> zeroinitializer) ret <8 x half> %load } @@ -814,113 +1994,383 @@ define <16 x half> @masked_load_v16f16(ptr %src, <16 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_load_v16f16: ; 
NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.16b, v0.16b, #7 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI7_0 -; NONEON-NOSVE-NEXT: ldr q1, [x8, :lo12:.LCPI7_0] -; NONEON-NOSVE-NEXT: cmlt v0.16b, v0.16b, #0 -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: sub sp, sp, #1024 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 1040 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: str q0, [sp, #976] +; NONEON-NOSVE-NEXT: adrp x9, .LCPI7_0 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #984] +; NONEON-NOSVE-NEXT: and w8, w8, #0x1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1000] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #976] +; NONEON-NOSVE-NEXT: and w8, w8, #0x1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #992] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #991] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x80 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1007] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #990] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x40 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1006] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #989] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x20 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1005] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #988] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x10 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1004] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #987] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1003] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #986] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x4 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1002] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #985] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1001] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #983] +; 
NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x80 +; NONEON-NOSVE-NEXT: strb w8, [sp, #999] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #982] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x40 +; NONEON-NOSVE-NEXT: strb w8, [sp, #998] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #981] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x20 +; NONEON-NOSVE-NEXT: strb w8, [sp, #997] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #980] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x10 +; NONEON-NOSVE-NEXT: strb w8, [sp, #996] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #979] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #995] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #978] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x4 +; NONEON-NOSVE-NEXT: strb w8, [sp, #994] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #977] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: strb w8, [sp, #993] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #992] ; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: movi v1.2d, #0000000000000000 -; NONEON-NOSVE-NEXT: addv h2, v0.8h -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 -; NONEON-NOSVE-NEXT: fmov w8, s2 -; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB7_17 -; NONEON-NOSVE-NEXT: // %bb.1: // %else -; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB7_18 -; NONEON-NOSVE-NEXT: .LBB7_2: // %else2 -; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB7_19 -; NONEON-NOSVE-NEXT: .LBB7_3: // %else5 -; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB7_20 -; NONEON-NOSVE-NEXT: .LBB7_4: // %else8 -; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB7_21 -; NONEON-NOSVE-NEXT: .LBB7_5: // %else11 -; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB7_22 -; NONEON-NOSVE-NEXT: .LBB7_6: // %else14 -; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB7_23 -; 
NONEON-NOSVE-NEXT: .LBB7_7: // %else17 -; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB7_24 -; NONEON-NOSVE-NEXT: .LBB7_8: // %else20 -; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB7_25 -; NONEON-NOSVE-NEXT: .LBB7_9: // %else23 -; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB7_26 -; NONEON-NOSVE-NEXT: .LBB7_10: // %else26 -; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB7_27 -; NONEON-NOSVE-NEXT: .LBB7_11: // %else29 -; NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB7_28 -; NONEON-NOSVE-NEXT: .LBB7_12: // %else32 -; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB7_29 -; NONEON-NOSVE-NEXT: .LBB7_13: // %else35 -; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB7_30 -; NONEON-NOSVE-NEXT: .LBB7_14: // %else38 -; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB7_31 -; NONEON-NOSVE-NEXT: .LBB7_15: // %else41 -; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB7_32 -; NONEON-NOSVE-NEXT: .LBB7_16: // %else44 -; NONEON-NOSVE-NEXT: ret -; NONEON-NOSVE-NEXT: .LBB7_17: // %cond.load -; NONEON-NOSVE-NEXT: ldr h0, [x0] -; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB7_2 -; NONEON-NOSVE-NEXT: .LBB7_18: // %cond.load1 -; NONEON-NOSVE-NEXT: add x9, x0, #2 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[1], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB7_3 -; NONEON-NOSVE-NEXT: .LBB7_19: // %cond.load4 -; NONEON-NOSVE-NEXT: add x9, x0, #4 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB7_4 -; NONEON-NOSVE-NEXT: .LBB7_20: // %cond.load7 -; NONEON-NOSVE-NEXT: add x9, x0, #6 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[3], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB7_5 -; NONEON-NOSVE-NEXT: .LBB7_21: // %cond.load10 -; NONEON-NOSVE-NEXT: add x9, x0, #8 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[4], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB7_6 -; NONEON-NOSVE-NEXT: .LBB7_22: // %cond.load13 -; NONEON-NOSVE-NEXT: add x9, x0, #10 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[5], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB7_7 -; NONEON-NOSVE-NEXT: .LBB7_23: // %cond.load16 -; NONEON-NOSVE-NEXT: add x9, x0, #12 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[6], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB7_8 
-; NONEON-NOSVE-NEXT: .LBB7_24: // %cond.load19 -; NONEON-NOSVE-NEXT: add x9, x0, #14 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[7], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #8, .LBB7_9 -; NONEON-NOSVE-NEXT: .LBB7_25: // %cond.load22 -; NONEON-NOSVE-NEXT: add x9, x0, #16 -; NONEON-NOSVE-NEXT: ld1 { v1.h }[0], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #9, .LBB7_10 -; NONEON-NOSVE-NEXT: .LBB7_26: // %cond.load25 -; NONEON-NOSVE-NEXT: add x9, x0, #18 -; NONEON-NOSVE-NEXT: ld1 { v1.h }[1], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #10, .LBB7_11 -; NONEON-NOSVE-NEXT: .LBB7_27: // %cond.load28 -; NONEON-NOSVE-NEXT: add x9, x0, #20 -; NONEON-NOSVE-NEXT: ld1 { v1.h }[2], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #11, .LBB7_12 -; NONEON-NOSVE-NEXT: .LBB7_28: // %cond.load31 -; NONEON-NOSVE-NEXT: add x9, x0, #22 -; NONEON-NOSVE-NEXT: ld1 { v1.h }[3], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #12, .LBB7_13 -; NONEON-NOSVE-NEXT: .LBB7_29: // %cond.load34 -; NONEON-NOSVE-NEXT: add x9, x0, #24 -; NONEON-NOSVE-NEXT: ld1 { v1.h }[4], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #13, .LBB7_14 -; NONEON-NOSVE-NEXT: .LBB7_30: // %cond.load37 -; NONEON-NOSVE-NEXT: add x9, x0, #26 -; NONEON-NOSVE-NEXT: ld1 { v1.h }[5], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #14, .LBB7_15 -; NONEON-NOSVE-NEXT: .LBB7_31: // %cond.load40 -; NONEON-NOSVE-NEXT: add x9, x0, #28 -; NONEON-NOSVE-NEXT: ld1 { v1.h }[6], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB7_16 -; NONEON-NOSVE-NEXT: .LBB7_32: // %cond.load43 -; NONEON-NOSVE-NEXT: add x8, x0, #30 -; NONEON-NOSVE-NEXT: ld1 { v1.h }[7], [x8] +; NONEON-NOSVE-NEXT: ldr q1, [x9, :lo12:.LCPI7_0] +; NONEON-NOSVE-NEXT: add x9, sp, #720 +; NONEON-NOSVE-NEXT: str q0, [sp, #1008] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #1010] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #1008] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #1012] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #1014] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #1016] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #1018] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #1020] +; NONEON-NOSVE-NEXT: add w8, w10, w8 +; 
NONEON-NOSVE-NEXT: add w10, w11, w12 +; NONEON-NOSVE-NEXT: add w11, w13, w14 +; NONEON-NOSVE-NEXT: add w8, w8, w10 +; NONEON-NOSVE-NEXT: add w10, w11, w15 +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #1022] +; NONEON-NOSVE-NEXT: add w8, w8, w10 +; NONEON-NOSVE-NEXT: add w8, w8, w11 +; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB7_2 +; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: ldr h2, [x0] +; NONEON-NOSVE-NEXT: stur wzr, [x9, #250] +; NONEON-NOSVE-NEXT: stur xzr, [x9, #242] +; NONEON-NOSVE-NEXT: str h2, [sp, #960] +; NONEON-NOSVE-NEXT: str h0, [sp, #974] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #960] +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB7_3 +; NONEON-NOSVE-NEXT: b .LBB7_4 +; NONEON-NOSVE-NEXT: .LBB7_2: +; NONEON-NOSVE-NEXT: mov v0.16b, v1.16b +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB7_4 +; NONEON-NOSVE-NEXT: .LBB7_3: // %cond.load1 +; NONEON-NOSVE-NEXT: ldr h2, [x0, #2] +; NONEON-NOSVE-NEXT: str q0, [sp, #912] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #924] +; NONEON-NOSVE-NEXT: str h2, [sp, #944] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #944] +; NONEON-NOSVE-NEXT: str w10, [sp, #940] +; NONEON-NOSVE-NEXT: str q0, [sp, #896] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #196] +; NONEON-NOSVE-NEXT: stur x10, [x9, #212] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #912] +; NONEON-NOSVE-NEXT: str h0, [sp, #928] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #896] +; NONEON-NOSVE-NEXT: str h0, [sp, #930] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #928] +; NONEON-NOSVE-NEXT: .LBB7_4: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB7_20 +; NONEON-NOSVE-NEXT: // %bb.5: // %else5 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB7_21 +; NONEON-NOSVE-NEXT: .LBB7_6: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB7_22 +; NONEON-NOSVE-NEXT: .LBB7_7: // %else11 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB7_23 +; NONEON-NOSVE-NEXT: .LBB7_8: // %else14 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB7_24 +; NONEON-NOSVE-NEXT: .LBB7_9: // %else17 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB7_25 +; NONEON-NOSVE-NEXT: 
.LBB7_10: // %else20 +; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB7_26 +; NONEON-NOSVE-NEXT: .LBB7_11: // %else23 +; NONEON-NOSVE-NEXT: add x9, sp, #208 +; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB7_27 +; NONEON-NOSVE-NEXT: .LBB7_12: // %else26 +; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB7_28 +; NONEON-NOSVE-NEXT: .LBB7_13: // %else29 +; NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB7_29 +; NONEON-NOSVE-NEXT: .LBB7_14: // %else32 +; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB7_30 +; NONEON-NOSVE-NEXT: .LBB7_15: // %else35 +; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB7_31 +; NONEON-NOSVE-NEXT: .LBB7_16: // %else38 +; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB7_32 +; NONEON-NOSVE-NEXT: .LBB7_17: // %else41 +; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB7_19 +; NONEON-NOSVE-NEXT: .LBB7_18: // %cond.load43 +; NONEON-NOSVE-NEXT: ldr h2, [x0, #30] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: str h2, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #48] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: str q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: str h1, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #16] +; NONEON-NOSVE-NEXT: str h1, [sp, #46] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #32] +; NONEON-NOSVE-NEXT: .LBB7_19: // %else44 +; NONEON-NOSVE-NEXT: add sp, sp, #1024 +; NONEON-NOSVE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB7_20: // %cond.load4 +; NONEON-NOSVE-NEXT: ldr h2, [x0, #4] +; NONEON-NOSVE-NEXT: str q0, [sp, #848] +; NONEON-NOSVE-NEXT: str h2, [sp, #880] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #880] +; NONEON-NOSVE-NEXT: str q0, [sp, #832] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #862] +; NONEON-NOSVE-NEXT: str h0, [sp, #878] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #134] +; NONEON-NOSVE-NEXT: stur x10, [x9, #150] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #848] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #832] +; 
NONEON-NOSVE-NEXT: str w10, [sp, #864] +; NONEON-NOSVE-NEXT: str h0, [sp, #868] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #864] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB7_6 +; NONEON-NOSVE-NEXT: .LBB7_21: // %cond.load7 +; NONEON-NOSVE-NEXT: ldr h2, [x0, #6] +; NONEON-NOSVE-NEXT: str q0, [sp, #784] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #792] +; NONEON-NOSVE-NEXT: str h2, [sp, #816] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #816] +; NONEON-NOSVE-NEXT: str x10, [sp, #808] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #784] +; NONEON-NOSVE-NEXT: str q0, [sp, #768] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #788] +; NONEON-NOSVE-NEXT: str w10, [sp, #800] +; NONEON-NOSVE-NEXT: str h0, [sp, #804] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #768] +; NONEON-NOSVE-NEXT: str h0, [sp, #806] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #800] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB7_7 +; NONEON-NOSVE-NEXT: .LBB7_22: // %cond.load10 +; NONEON-NOSVE-NEXT: ldr h2, [x0, #8] +; NONEON-NOSVE-NEXT: str q0, [sp, #720] +; NONEON-NOSVE-NEXT: str h2, [sp, #752] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #752] +; NONEON-NOSVE-NEXT: str q0, [sp, #704] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #734] +; NONEON-NOSVE-NEXT: str h0, [sp, #750] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #10] +; NONEON-NOSVE-NEXT: stur w10, [x9, #26] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #720] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #704] +; NONEON-NOSVE-NEXT: str x9, [sp, #736] +; NONEON-NOSVE-NEXT: str h0, [sp, #744] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #736] +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB7_8 +; NONEON-NOSVE-NEXT: .LBB7_23: // %cond.load13 +; NONEON-NOSVE-NEXT: ldr h2, [x0, #10] +; NONEON-NOSVE-NEXT: str q0, [sp, #656] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #668] +; NONEON-NOSVE-NEXT: str h2, [sp, #688] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #688] +; NONEON-NOSVE-NEXT: str w9, [sp, #684] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #656] +; NONEON-NOSVE-NEXT: str q0, [sp, #640] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #664] +; NONEON-NOSVE-NEXT: str x9, [sp, #672] +; NONEON-NOSVE-NEXT: str h0, [sp, #680] +; 
NONEON-NOSVE-NEXT: ldr h0, [sp, #640] +; NONEON-NOSVE-NEXT: str h0, [sp, #682] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #672] +; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB7_9 +; NONEON-NOSVE-NEXT: .LBB7_24: // %cond.load16 +; NONEON-NOSVE-NEXT: ldr h2, [x0, #12] +; NONEON-NOSVE-NEXT: str q0, [sp, #592] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #600] +; NONEON-NOSVE-NEXT: str h2, [sp, #624] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #624] +; NONEON-NOSVE-NEXT: str w9, [sp, #616] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #592] +; NONEON-NOSVE-NEXT: str q0, [sp, #576] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #606] +; NONEON-NOSVE-NEXT: str x9, [sp, #608] +; NONEON-NOSVE-NEXT: str h0, [sp, #622] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #576] +; NONEON-NOSVE-NEXT: str h0, [sp, #620] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #608] +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB7_10 +; NONEON-NOSVE-NEXT: .LBB7_25: // %cond.load19 +; NONEON-NOSVE-NEXT: ldr h2, [x0, #14] +; NONEON-NOSVE-NEXT: str q0, [sp, #512] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #520] +; NONEON-NOSVE-NEXT: str h2, [sp, #560] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #560] +; NONEON-NOSVE-NEXT: str w9, [sp, #552] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #512] +; NONEON-NOSVE-NEXT: str q0, [sp, #528] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #524] +; NONEON-NOSVE-NEXT: str x9, [sp, #544] +; NONEON-NOSVE-NEXT: str h0, [sp, #556] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #528] +; NONEON-NOSVE-NEXT: str h0, [sp, #558] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #544] +; NONEON-NOSVE-NEXT: tbz w8, #8, .LBB7_11 +; NONEON-NOSVE-NEXT: .LBB7_26: // %cond.load22 +; NONEON-NOSVE-NEXT: ldr h2, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [sp, #464] +; NONEON-NOSVE-NEXT: add x9, sp, #464 +; NONEON-NOSVE-NEXT: str h2, [sp, #496] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #496] +; NONEON-NOSVE-NEXT: str q1, [sp, #448] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #478] +; NONEON-NOSVE-NEXT: str h1, [sp, #494] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #10] +; NONEON-NOSVE-NEXT: ldur x11, [x9, #2] +; NONEON-NOSVE-NEXT: stur w10, [x9, #26] +; 
NONEON-NOSVE-NEXT: stur x11, [x9, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #448] +; NONEON-NOSVE-NEXT: str h1, [sp, #480] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #480] +; NONEON-NOSVE-NEXT: add x9, sp, #208 +; NONEON-NOSVE-NEXT: tbz w8, #9, .LBB7_12 +; NONEON-NOSVE-NEXT: .LBB7_27: // %cond.load25 +; NONEON-NOSVE-NEXT: ldr h2, [x0, #18] +; NONEON-NOSVE-NEXT: str q1, [sp, #400] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #412] +; NONEON-NOSVE-NEXT: str h2, [sp, #432] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #432] +; NONEON-NOSVE-NEXT: str w10, [sp, #428] +; NONEON-NOSVE-NEXT: str q1, [sp, #384] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #196] +; NONEON-NOSVE-NEXT: stur x10, [x9, #212] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #400] +; NONEON-NOSVE-NEXT: str h1, [sp, #416] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #384] +; NONEON-NOSVE-NEXT: str h1, [sp, #418] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #416] +; NONEON-NOSVE-NEXT: tbz w8, #10, .LBB7_13 +; NONEON-NOSVE-NEXT: .LBB7_28: // %cond.load28 +; NONEON-NOSVE-NEXT: ldr h2, [x0, #20] +; NONEON-NOSVE-NEXT: str q1, [sp, #336] +; NONEON-NOSVE-NEXT: str h2, [sp, #368] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #368] +; NONEON-NOSVE-NEXT: str q1, [sp, #320] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #350] +; NONEON-NOSVE-NEXT: str h1, [sp, #366] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #134] +; NONEON-NOSVE-NEXT: stur x10, [x9, #150] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #336] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #320] +; NONEON-NOSVE-NEXT: str w10, [sp, #352] +; NONEON-NOSVE-NEXT: str h1, [sp, #356] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #352] +; NONEON-NOSVE-NEXT: tbz w8, #11, .LBB7_14 +; NONEON-NOSVE-NEXT: .LBB7_29: // %cond.load31 +; NONEON-NOSVE-NEXT: ldr h2, [x0, #22] +; NONEON-NOSVE-NEXT: str q1, [sp, #272] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #280] +; NONEON-NOSVE-NEXT: str h2, [sp, #304] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #304] +; NONEON-NOSVE-NEXT: str x10, [sp, #296] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #272] +; NONEON-NOSVE-NEXT: str q1, [sp, #256] +; NONEON-NOSVE-NEXT: ldr h1, [sp, 
#276] +; NONEON-NOSVE-NEXT: str w10, [sp, #288] +; NONEON-NOSVE-NEXT: str h1, [sp, #292] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #256] +; NONEON-NOSVE-NEXT: str h1, [sp, #294] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #288] +; NONEON-NOSVE-NEXT: tbz w8, #12, .LBB7_15 +; NONEON-NOSVE-NEXT: .LBB7_30: // %cond.load34 +; NONEON-NOSVE-NEXT: ldr h2, [x0, #24] +; NONEON-NOSVE-NEXT: str q1, [sp, #208] +; NONEON-NOSVE-NEXT: str h2, [sp, #240] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #240] +; NONEON-NOSVE-NEXT: str q1, [sp, #192] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #222] +; NONEON-NOSVE-NEXT: str h1, [sp, #238] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #10] +; NONEON-NOSVE-NEXT: stur w10, [x9, #26] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #208] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #192] +; NONEON-NOSVE-NEXT: str x9, [sp, #224] +; NONEON-NOSVE-NEXT: str h1, [sp, #232] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #224] +; NONEON-NOSVE-NEXT: tbz w8, #13, .LBB7_16 +; NONEON-NOSVE-NEXT: .LBB7_31: // %cond.load37 +; NONEON-NOSVE-NEXT: ldr h2, [x0, #26] +; NONEON-NOSVE-NEXT: str q1, [sp, #144] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #156] +; NONEON-NOSVE-NEXT: str h2, [sp, #176] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #176] +; NONEON-NOSVE-NEXT: str w9, [sp, #172] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #144] +; NONEON-NOSVE-NEXT: str q1, [sp, #128] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #152] +; NONEON-NOSVE-NEXT: str x9, [sp, #160] +; NONEON-NOSVE-NEXT: str h1, [sp, #168] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #128] +; NONEON-NOSVE-NEXT: str h1, [sp, #170] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #160] +; NONEON-NOSVE-NEXT: tbz w8, #14, .LBB7_17 +; NONEON-NOSVE-NEXT: .LBB7_32: // %cond.load40 +; NONEON-NOSVE-NEXT: ldr h2, [x0, #28] +; NONEON-NOSVE-NEXT: str q1, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #88] +; NONEON-NOSVE-NEXT: str h2, [sp, #112] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #112] +; NONEON-NOSVE-NEXT: str w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #80] +; NONEON-NOSVE-NEXT: str q1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #94] 
+; NONEON-NOSVE-NEXT: str x9, [sp, #96] +; NONEON-NOSVE-NEXT: str h1, [sp, #110] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #64] +; NONEON-NOSVE-NEXT: str h1, [sp, #108] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #96] +; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB7_18 +; NONEON-NOSVE-NEXT: b .LBB7_19 %load = call <16 x half> @llvm.masked.load.v16f16(ptr %src, i32 8, <16 x i1> %mask, <16 x half> zeroinitializer) ret <16 x half> %load } @@ -939,27 +2389,38 @@ define <2 x float> @masked_load_v2f32(ptr %src, <2 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_load_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #31 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI8_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI8_0] -; NONEON-NOSVE-NEXT: cmlt v0.2s, v0.2s, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addp v1.2s, v0.2s, v0.2s -; NONEON-NOSVE-NEXT: movi d0, #0000000000000000 -; NONEON-NOSVE-NEXT: fmov w8, s1 -; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB8_3 -; NONEON-NOSVE-NEXT: // %bb.1: // %else -; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB8_4 -; NONEON-NOSVE-NEXT: .LBB8_2: // %else2 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 -; NONEON-NOSVE-NEXT: ret -; NONEON-NOSVE-NEXT: .LBB8_3: // %cond.load +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: str d0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #48] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: bfxil w8, w9, #0, #1 +; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB8_2 +; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load ; NONEON-NOSVE-NEXT: ldr s0, [x0] -; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB8_2 -; NONEON-NOSVE-NEXT: .LBB8_4: // %cond.load1 -; NONEON-NOSVE-NEXT: add x8, x0, #4 -; NONEON-NOSVE-NEXT: ld1 { v0.s }[1], [x8] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str wzr, [sp, #44] +; NONEON-NOSVE-NEXT: str s0, [sp, #40] +; 
NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB8_3 +; NONEON-NOSVE-NEXT: b .LBB8_4 +; NONEON-NOSVE-NEXT: .LBB8_2: +; NONEON-NOSVE-NEXT: adrp x9, .LCPI8_0 +; NONEON-NOSVE-NEXT: ldr d0, [x9, :lo12:.LCPI8_0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB8_4 +; NONEON-NOSVE-NEXT: .LBB8_3: // %cond.load1 +; NONEON-NOSVE-NEXT: ldr s1, [x0, #4] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: str s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #32] +; NONEON-NOSVE-NEXT: str d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: stp s1, s0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: .LBB8_4: // %else2 +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %load = call <2 x float> @llvm.masked.load.v2f32(ptr %src, i32 8, <2 x i1> %mask, <2 x float> zeroinitializer) ret <2 x float> %load @@ -980,37 +2441,80 @@ define <4 x float> @masked_load_v4f32(ptr %src, <4 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_load_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI9_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI9_0] -; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addv h1, v0.4h -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 -; NONEON-NOSVE-NEXT: fmov w8, s1 -; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB9_5 -; NONEON-NOSVE-NEXT: // %bb.1: // %else -; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB9_6 -; NONEON-NOSVE-NEXT: .LBB9_2: // %else2 +; NONEON-NOSVE-NEXT: sub sp, sp, #224 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 224 +; NONEON-NOSVE-NEXT: str d0, [sp, #208] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #210] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #212] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #214] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #208] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: 
sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: and w9, w9, #0x4 +; NONEON-NOSVE-NEXT: and w10, w10, #0x8 +; NONEON-NOSVE-NEXT: bfxil w8, w11, #0, #1 +; NONEON-NOSVE-NEXT: orr w9, w9, w10 +; NONEON-NOSVE-NEXT: orr w8, w8, w9 +; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB9_2 +; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load +; NONEON-NOSVE-NEXT: ldr s0, [x0] +; NONEON-NOSVE-NEXT: str wzr, [sp, #204] +; NONEON-NOSVE-NEXT: stur xzr, [sp, #196] +; NONEON-NOSVE-NEXT: str s0, [sp, #192] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #192] +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB9_3 +; NONEON-NOSVE-NEXT: b .LBB9_4 +; NONEON-NOSVE-NEXT: .LBB9_2: +; NONEON-NOSVE-NEXT: adrp x9, .LCPI9_0 +; NONEON-NOSVE-NEXT: ldr q0, [x9, :lo12:.LCPI9_0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB9_4 +; NONEON-NOSVE-NEXT: .LBB9_3: // %cond.load1 +; NONEON-NOSVE-NEXT: ldr s1, [x0, #4] +; NONEON-NOSVE-NEXT: str q0, [sp, #144] +; NONEON-NOSVE-NEXT: str s1, [sp, #176] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #176] +; NONEON-NOSVE-NEXT: str q0, [sp, #128] +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #152] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #168] +; NONEON-NOSVE-NEXT: ldr s1, [sp, #144] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #128] +; NONEON-NOSVE-NEXT: stp s1, s0, [sp, #160] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #160] +; NONEON-NOSVE-NEXT: .LBB9_4: // %else2 ; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB9_7 -; NONEON-NOSVE-NEXT: .LBB9_3: // %else5 +; NONEON-NOSVE-NEXT: // %bb.5: // %else5 ; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB9_8 -; NONEON-NOSVE-NEXT: .LBB9_4: // %else8 +; NONEON-NOSVE-NEXT: .LBB9_6: // %else8 +; NONEON-NOSVE-NEXT: add sp, sp, #224 ; NONEON-NOSVE-NEXT: ret -; NONEON-NOSVE-NEXT: .LBB9_5: // %cond.load -; NONEON-NOSVE-NEXT: ldr s0, [x0] -; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB9_2 -; NONEON-NOSVE-NEXT: .LBB9_6: // %cond.load1 -; NONEON-NOSVE-NEXT: add x9, x0, #4 -; NONEON-NOSVE-NEXT: ld1 { v0.s }[1], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB9_3 ; NONEON-NOSVE-NEXT: .LBB9_7: // %cond.load4 -; 
NONEON-NOSVE-NEXT: add x9, x0, #8 -; NONEON-NOSVE-NEXT: ld1 { v0.s }[2], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB9_4 +; NONEON-NOSVE-NEXT: ldr s1, [x0, #8] +; NONEON-NOSVE-NEXT: str q0, [sp, #80] +; NONEON-NOSVE-NEXT: str s1, [sp, #112] +; NONEON-NOSVE-NEXT: ldr s1, [sp, #92] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #112] +; NONEON-NOSVE-NEXT: str q0, [sp, #64] +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #80] +; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #96] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #64] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #104] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #96] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB9_6 ; NONEON-NOSVE-NEXT: .LBB9_8: // %cond.load7 -; NONEON-NOSVE-NEXT: add x8, x0, #12 -; NONEON-NOSVE-NEXT: ld1 { v0.s }[3], [x8] +; NONEON-NOSVE-NEXT: ldr s1, [x0, #12] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: str s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: stp s1, s0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #224 ; NONEON-NOSVE-NEXT: ret %load = call <4 x float> @llvm.masked.load.v4f32(ptr %src, i32 8, <4 x i1> %mask, <4 x float> zeroinitializer) ret <4 x float> %load @@ -1064,63 +2568,170 @@ define <8 x float> @masked_load_v8f32(ptr %src, <8 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_load_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.8b, v0.8b, #7 +; NONEON-NOSVE-NEXT: sub sp, sp, #496 +; NONEON-NOSVE-NEXT: str x29, [sp, #480] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 496 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: str d0, [sp, #464] ; NONEON-NOSVE-NEXT: adrp x8, .LCPI10_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI10_0] -; NONEON-NOSVE-NEXT: cmlt v0.8b, v0.8b, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; 
NONEON-NOSVE-NEXT: movi v1.2d, #0000000000000000 -; NONEON-NOSVE-NEXT: addv b2, v0.8b -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 -; NONEON-NOSVE-NEXT: fmov w8, s2 -; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB10_9 -; NONEON-NOSVE-NEXT: // %bb.1: // %else -; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB10_10 -; NONEON-NOSVE-NEXT: .LBB10_2: // %else2 -; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB10_11 -; NONEON-NOSVE-NEXT: .LBB10_3: // %else5 -; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB10_12 -; NONEON-NOSVE-NEXT: .LBB10_4: // %else8 -; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB10_13 -; NONEON-NOSVE-NEXT: .LBB10_5: // %else11 -; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB10_14 -; NONEON-NOSVE-NEXT: .LBB10_6: // %else14 -; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB10_15 -; NONEON-NOSVE-NEXT: .LBB10_7: // %else17 -; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB10_16 -; NONEON-NOSVE-NEXT: .LBB10_8: // %else20 -; NONEON-NOSVE-NEXT: ret -; NONEON-NOSVE-NEXT: .LBB10_9: // %cond.load +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #466] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #467] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #465] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #468] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #469] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #470] +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #464] +; NONEON-NOSVE-NEXT: sbfx w13, w13, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w14, w14, #0, #1 +; NONEON-NOSVE-NEXT: and w9, w9, #0x4 +; NONEON-NOSVE-NEXT: and w10, w10, #0x8 +; NONEON-NOSVE-NEXT: sbfx w15, w15, #0, #1 +; NONEON-NOSVE-NEXT: orr w9, w9, w10 +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #471] +; NONEON-NOSVE-NEXT: and w11, w11, #0x2 +; NONEON-NOSVE-NEXT: and w13, w13, #0x10 +; NONEON-NOSVE-NEXT: bfxil w11, w12, #0, #1 +; NONEON-NOSVE-NEXT: and w12, w14, #0x20 +; NONEON-NOSVE-NEXT: orr w9, w9, w13 +; NONEON-NOSVE-NEXT: and w13, w15, #0x40 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: orr w9, w11, w9 +; 
NONEON-NOSVE-NEXT: orr w11, w12, w13 +; NONEON-NOSVE-NEXT: ldr q1, [x8, :lo12:.LCPI10_0] +; NONEON-NOSVE-NEXT: orr w9, w9, w11 +; NONEON-NOSVE-NEXT: and w10, w10, #0x80 +; NONEON-NOSVE-NEXT: add w10, w9, w10 +; NONEON-NOSVE-NEXT: add x9, sp, #208 +; NONEON-NOSVE-NEXT: and w8, w10, #0xff +; NONEON-NOSVE-NEXT: tbz w10, #0, .LBB10_2 +; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load ; NONEON-NOSVE-NEXT: ldr s0, [x0] -; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB10_2 -; NONEON-NOSVE-NEXT: .LBB10_10: // %cond.load1 -; NONEON-NOSVE-NEXT: add x9, x0, #4 -; NONEON-NOSVE-NEXT: ld1 { v0.s }[1], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB10_3 -; NONEON-NOSVE-NEXT: .LBB10_11: // %cond.load4 -; NONEON-NOSVE-NEXT: add x9, x0, #8 -; NONEON-NOSVE-NEXT: ld1 { v0.s }[2], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB10_4 -; NONEON-NOSVE-NEXT: .LBB10_12: // %cond.load7 -; NONEON-NOSVE-NEXT: add x9, x0, #12 -; NONEON-NOSVE-NEXT: ld1 { v0.s }[3], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB10_5 -; NONEON-NOSVE-NEXT: .LBB10_13: // %cond.load10 -; NONEON-NOSVE-NEXT: add x9, x0, #16 -; NONEON-NOSVE-NEXT: ld1 { v1.s }[0], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB10_6 -; NONEON-NOSVE-NEXT: .LBB10_14: // %cond.load13 -; NONEON-NOSVE-NEXT: add x9, x0, #20 -; NONEON-NOSVE-NEXT: ld1 { v1.s }[1], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB10_7 -; NONEON-NOSVE-NEXT: .LBB10_15: // %cond.load16 -; NONEON-NOSVE-NEXT: add x9, x0, #24 -; NONEON-NOSVE-NEXT: ld1 { v1.s }[2], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB10_8 -; NONEON-NOSVE-NEXT: .LBB10_16: // %cond.load19 -; NONEON-NOSVE-NEXT: add x8, x0, #28 -; NONEON-NOSVE-NEXT: ld1 { v1.s }[3], [x8] +; NONEON-NOSVE-NEXT: str wzr, [sp, #460] +; NONEON-NOSVE-NEXT: stur xzr, [x9, #244] +; NONEON-NOSVE-NEXT: str s0, [sp, #448] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #448] +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB10_3 +; NONEON-NOSVE-NEXT: b .LBB10_4 +; NONEON-NOSVE-NEXT: .LBB10_2: +; NONEON-NOSVE-NEXT: mov v0.16b, v1.16b +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB10_4 +; 
NONEON-NOSVE-NEXT: .LBB10_3: // %cond.load1 +; NONEON-NOSVE-NEXT: ldr s2, [x0, #4] +; NONEON-NOSVE-NEXT: str q0, [sp, #400] +; NONEON-NOSVE-NEXT: str s2, [sp, #432] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #432] +; NONEON-NOSVE-NEXT: str q0, [sp, #384] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #412] +; NONEON-NOSVE-NEXT: str s0, [sp, #428] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #408] +; NONEON-NOSVE-NEXT: str s0, [sp, #424] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #400] +; NONEON-NOSVE-NEXT: str s0, [sp, #416] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #384] +; NONEON-NOSVE-NEXT: str s0, [sp, #420] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #416] +; NONEON-NOSVE-NEXT: .LBB10_4: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB10_12 +; NONEON-NOSVE-NEXT: // %bb.5: // %else5 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB10_13 +; NONEON-NOSVE-NEXT: .LBB10_6: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB10_14 +; NONEON-NOSVE-NEXT: .LBB10_7: // %else11 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB10_15 +; NONEON-NOSVE-NEXT: .LBB10_8: // %else14 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB10_16 +; NONEON-NOSVE-NEXT: .LBB10_9: // %else17 +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB10_11 +; NONEON-NOSVE-NEXT: .LBB10_10: // %cond.load19 +; NONEON-NOSVE-NEXT: ldr s2, [x0, #28] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: str s2, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #48] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: str q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr s1, [sp, #16] +; NONEON-NOSVE-NEXT: stp s2, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #32] +; NONEON-NOSVE-NEXT: .LBB10_11: // %else20 +; NONEON-NOSVE-NEXT: ldr x29, [sp, #480] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: add sp, sp, #496 ; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB10_12: // %cond.load4 +; NONEON-NOSVE-NEXT: ldr s2, [x0, #8] +; NONEON-NOSVE-NEXT: str q0, [sp, #336] +; NONEON-NOSVE-NEXT: str s2, [sp, #368] +; NONEON-NOSVE-NEXT: ldr q0, [sp, 
#368] +; NONEON-NOSVE-NEXT: str q0, [sp, #320] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #348] +; NONEON-NOSVE-NEXT: str s0, [sp, #364] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #340] +; NONEON-NOSVE-NEXT: str s0, [sp, #356] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #336] +; NONEON-NOSVE-NEXT: str s0, [sp, #352] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #320] +; NONEON-NOSVE-NEXT: str s0, [sp, #360] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #352] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB10_6 +; NONEON-NOSVE-NEXT: .LBB10_13: // %cond.load7 +; NONEON-NOSVE-NEXT: ldr s2, [x0, #12] +; NONEON-NOSVE-NEXT: str q0, [sp, #256] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #256] +; NONEON-NOSVE-NEXT: str s2, [sp, #304] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #304] +; NONEON-NOSVE-NEXT: str x10, [sp, #288] +; NONEON-NOSVE-NEXT: str q0, [sp, #272] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #264] +; NONEON-NOSVE-NEXT: str s0, [sp, #296] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #272] +; NONEON-NOSVE-NEXT: str s0, [sp, #300] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #288] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB10_7 +; NONEON-NOSVE-NEXT: .LBB10_14: // %cond.load10 +; NONEON-NOSVE-NEXT: ldr s2, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [sp, #208] +; NONEON-NOSVE-NEXT: str s2, [sp, #240] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #240] +; NONEON-NOSVE-NEXT: str q1, [sp, #192] +; NONEON-NOSVE-NEXT: ldr s1, [sp, #220] +; NONEON-NOSVE-NEXT: str s1, [sp, #236] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #4] +; NONEON-NOSVE-NEXT: stur x10, [x9, #20] +; NONEON-NOSVE-NEXT: ldr s1, [sp, #192] +; NONEON-NOSVE-NEXT: str s1, [sp, #224] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #224] +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB10_8 +; NONEON-NOSVE-NEXT: .LBB10_15: // %cond.load13 +; NONEON-NOSVE-NEXT: ldr s2, [x0, #20] +; NONEON-NOSVE-NEXT: str q1, [sp, #144] +; NONEON-NOSVE-NEXT: str s2, [sp, #176] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #176] +; NONEON-NOSVE-NEXT: str q1, [sp, #128] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #152] +; NONEON-NOSVE-NEXT: stp s1, s2, [sp, #168] +; NONEON-NOSVE-NEXT: ldr s2, 
[sp, #144] +; NONEON-NOSVE-NEXT: ldr s1, [sp, #128] +; NONEON-NOSVE-NEXT: stp s2, s1, [sp, #160] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #160] +; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB10_9 +; NONEON-NOSVE-NEXT: .LBB10_16: // %cond.load16 +; NONEON-NOSVE-NEXT: ldr s2, [x0, #24] +; NONEON-NOSVE-NEXT: str q1, [sp, #80] +; NONEON-NOSVE-NEXT: str s2, [sp, #112] +; NONEON-NOSVE-NEXT: ldr s2, [sp, #92] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #112] +; NONEON-NOSVE-NEXT: str q1, [sp, #64] +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #80] +; NONEON-NOSVE-NEXT: stp s1, s3, [sp, #96] +; NONEON-NOSVE-NEXT: ldr s1, [sp, #64] +; NONEON-NOSVE-NEXT: stp s1, s2, [sp, #104] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #96] +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB10_10 +; NONEON-NOSVE-NEXT: b .LBB10_11 %load = call <8 x float> @llvm.masked.load.v8f32(ptr %src, i32 8, <8 x i1> %mask, <8 x float> zeroinitializer) ret <8 x float> %load } @@ -1140,25 +2751,38 @@ define <2 x double> @masked_load_v2f64(ptr %src, <2 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_load_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #31 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI11_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI11_0] -; NONEON-NOSVE-NEXT: cmlt v0.2s, v0.2s, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addp v1.2s, v0.2s, v0.2s -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 -; NONEON-NOSVE-NEXT: fmov w8, s1 -; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB11_3 -; NONEON-NOSVE-NEXT: // %bb.1: // %else -; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB11_4 -; NONEON-NOSVE-NEXT: .LBB11_2: // %else2 -; NONEON-NOSVE-NEXT: ret -; NONEON-NOSVE-NEXT: .LBB11_3: // %cond.load +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: str d0, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #80] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: bfxil w8, w9, #0, #1 +; 
NONEON-NOSVE-NEXT: tbz w8, #0, .LBB11_2 +; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load ; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB11_2 -; NONEON-NOSVE-NEXT: .LBB11_4: // %cond.load1 -; NONEON-NOSVE-NEXT: add x8, x0, #8 -; NONEON-NOSVE-NEXT: ld1 { v0.d }[1], [x8] +; NONEON-NOSVE-NEXT: str xzr, [sp, #72] +; NONEON-NOSVE-NEXT: str d0, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB11_3 +; NONEON-NOSVE-NEXT: b .LBB11_4 +; NONEON-NOSVE-NEXT: .LBB11_2: +; NONEON-NOSVE-NEXT: adrp x9, .LCPI11_0 +; NONEON-NOSVE-NEXT: ldr q0, [x9, :lo12:.LCPI11_0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB11_4 +; NONEON-NOSVE-NEXT: .LBB11_3: // %cond.load1 +; NONEON-NOSVE-NEXT: ldr d1, [x0, #8] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: str d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d1, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: .LBB11_4: // %else2 +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %load = call <2 x double> @llvm.masked.load.v2f64(ptr %src, i32 8, <2 x i1> %mask, <2 x double> zeroinitializer) ret <2 x double> %load @@ -1188,38 +2812,74 @@ define <4 x double> @masked_load_v4f64(ptr %src, <4 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_load_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI12_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI12_0] -; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: movi v1.2d, #0000000000000000 -; NONEON-NOSVE-NEXT: addv h2, v0.4h -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 -; NONEON-NOSVE-NEXT: fmov w8, s2 -; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB12_5 -; NONEON-NOSVE-NEXT: // %bb.1: // %else -; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB12_6 -; 
NONEON-NOSVE-NEXT: .LBB12_2: // %else2 +; NONEON-NOSVE-NEXT: sub sp, sp, #224 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 224 +; NONEON-NOSVE-NEXT: str d0, [sp, #208] +; NONEON-NOSVE-NEXT: adrp x9, .LCPI12_0 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #210] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #212] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #214] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #208] +; NONEON-NOSVE-NEXT: ldr q1, [x9, :lo12:.LCPI12_0] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: and w10, w10, #0x4 +; NONEON-NOSVE-NEXT: and w11, w11, #0x8 +; NONEON-NOSVE-NEXT: bfxil w8, w12, #0, #1 +; NONEON-NOSVE-NEXT: orr w10, w10, w11 +; NONEON-NOSVE-NEXT: orr w8, w8, w10 +; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB12_2 +; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str xzr, [sp, #200] +; NONEON-NOSVE-NEXT: str d0, [sp, #192] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #192] +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB12_3 +; NONEON-NOSVE-NEXT: b .LBB12_4 +; NONEON-NOSVE-NEXT: .LBB12_2: +; NONEON-NOSVE-NEXT: mov v0.16b, v1.16b +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB12_4 +; NONEON-NOSVE-NEXT: .LBB12_3: // %cond.load1 +; NONEON-NOSVE-NEXT: ldr d2, [x0, #8] +; NONEON-NOSVE-NEXT: str q0, [sp, #128] +; NONEON-NOSVE-NEXT: str d2, [sp, #176] +; NONEON-NOSVE-NEXT: ldr d2, [sp, #128] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #176] +; NONEON-NOSVE-NEXT: str q0, [sp, #144] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #144] +; NONEON-NOSVE-NEXT: stp d2, d0, [sp, #160] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #160] +; NONEON-NOSVE-NEXT: .LBB12_4: // %else2 ; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB12_7 -; NONEON-NOSVE-NEXT: .LBB12_3: // %else5 +; NONEON-NOSVE-NEXT: // %bb.5: // %else5 ; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB12_8 -; NONEON-NOSVE-NEXT: .LBB12_4: // %else8 +; NONEON-NOSVE-NEXT: .LBB12_6: // %else8 +; NONEON-NOSVE-NEXT: add sp, sp, #224 ; 
NONEON-NOSVE-NEXT: ret -; NONEON-NOSVE-NEXT: .LBB12_5: // %cond.load -; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB12_2 -; NONEON-NOSVE-NEXT: .LBB12_6: // %cond.load1 -; NONEON-NOSVE-NEXT: add x9, x0, #8 -; NONEON-NOSVE-NEXT: ld1 { v0.d }[1], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB12_3 ; NONEON-NOSVE-NEXT: .LBB12_7: // %cond.load4 -; NONEON-NOSVE-NEXT: add x9, x0, #16 -; NONEON-NOSVE-NEXT: ld1 { v1.d }[0], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB12_4 +; NONEON-NOSVE-NEXT: ldr d2, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [sp, #80] +; NONEON-NOSVE-NEXT: str d2, [sp, #112] +; NONEON-NOSVE-NEXT: ldr d2, [sp, #88] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #112] +; NONEON-NOSVE-NEXT: str q1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d1, [sp, #64] +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #96] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #96] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB12_6 ; NONEON-NOSVE-NEXT: .LBB12_8: // %cond.load7 -; NONEON-NOSVE-NEXT: add x8, x0, #24 -; NONEON-NOSVE-NEXT: ld1 { v1.d }[1], [x8] +; NONEON-NOSVE-NEXT: ldr d2, [x0, #24] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: str d2, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d2, [sp] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #48] +; NONEON-NOSVE-NEXT: str q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp d2, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #224 ; NONEON-NOSVE-NEXT: ret %load = call <4 x double> @llvm.masked.load.v4f64(ptr %src, i32 8, <4 x i1> %mask, <4 x double> zeroinitializer) ret <4 x double> %load @@ -1249,34 +2909,51 @@ define <3 x i32> @masked_load_zext_v3i32(ptr %load_ptr, <3 x i1> %pm) { ; ; NONEON-NOSVE-LABEL: masked_load_zext_v3i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub sp, sp, #16 -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 ; NONEON-NOSVE-NEXT: and w8, w1, #0x1 ; NONEON-NOSVE-NEXT: bfi w8, w2, #1, #1 ; 
NONEON-NOSVE-NEXT: bfi w8, w3, #2, #1 ; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB13_2 ; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load -; NONEON-NOSVE-NEXT: ldr h0, [x0] +; NONEON-NOSVE-NEXT: ldrh w9, [x0] +; NONEON-NOSVE-NEXT: stur wzr, [sp, #66] +; NONEON-NOSVE-NEXT: strh w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #64] ; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB13_3 ; NONEON-NOSVE-NEXT: b .LBB13_4 ; NONEON-NOSVE-NEXT: .LBB13_2: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x9, .LCPI13_0 +; NONEON-NOSVE-NEXT: ldr d0, [x9, :lo12:.LCPI13_0] ; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB13_4 ; NONEON-NOSVE-NEXT: .LBB13_3: // %cond.load1 -; NONEON-NOSVE-NEXT: mov v1.16b, v0.16b -; NONEON-NOSVE-NEXT: add x9, x0, #2 -; NONEON-NOSVE-NEXT: ld1 { v1.h }[1], [x9] -; NONEON-NOSVE-NEXT: mov v1.h[2], v0.h[2] -; NONEON-NOSVE-NEXT: fmov d0, d1 +; NONEON-NOSVE-NEXT: ldrh w9, [x0, #2] +; NONEON-NOSVE-NEXT: str d0, [sp, #48] +; NONEON-NOSVE-NEXT: strh w9, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #52] +; NONEON-NOSVE-NEXT: strh w9, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #48] +; NONEON-NOSVE-NEXT: strh w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] ; NONEON-NOSVE-NEXT: .LBB13_4: // %else2 ; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB13_6 ; NONEON-NOSVE-NEXT: // %bb.5: // %cond.load4 -; NONEON-NOSVE-NEXT: mov v0.h[1], v0.h[1] -; NONEON-NOSVE-NEXT: add x8, x0, #4 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x8] +; NONEON-NOSVE-NEXT: ldrh w8, [x0, #4] +; NONEON-NOSVE-NEXT: str d0, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] ; NONEON-NOSVE-NEXT: .LBB13_6: // %else5 -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w9, 
[sp, #10] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %load_value = tail call <3 x i16> @llvm.masked.load.v3i16.p0(ptr %load_ptr, i32 4, <3 x i1> %pm, <3 x i16> zeroinitializer) %extend = zext <3 x i16> %load_value to <3 x i32> @@ -1307,34 +2984,51 @@ define <3 x i32> @masked_load_sext_v3i32(ptr %load_ptr, <3 x i1> %pm) { ; ; NONEON-NOSVE-LABEL: masked_load_sext_v3i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub sp, sp, #16 -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 ; NONEON-NOSVE-NEXT: and w8, w1, #0x1 ; NONEON-NOSVE-NEXT: bfi w8, w2, #1, #1 ; NONEON-NOSVE-NEXT: bfi w8, w3, #2, #1 ; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB14_2 ; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load -; NONEON-NOSVE-NEXT: ldr h0, [x0] +; NONEON-NOSVE-NEXT: ldrh w9, [x0] +; NONEON-NOSVE-NEXT: stur wzr, [sp, #66] +; NONEON-NOSVE-NEXT: strh w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #64] ; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB14_3 ; NONEON-NOSVE-NEXT: b .LBB14_4 ; NONEON-NOSVE-NEXT: .LBB14_2: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x9, .LCPI14_0 +; NONEON-NOSVE-NEXT: ldr d0, [x9, :lo12:.LCPI14_0] ; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB14_4 ; NONEON-NOSVE-NEXT: .LBB14_3: // %cond.load1 -; NONEON-NOSVE-NEXT: mov v1.16b, v0.16b -; NONEON-NOSVE-NEXT: add x9, x0, #2 -; NONEON-NOSVE-NEXT: ld1 { v1.h }[1], [x9] -; NONEON-NOSVE-NEXT: mov v1.h[2], v0.h[2] -; NONEON-NOSVE-NEXT: fmov d0, d1 +; NONEON-NOSVE-NEXT: ldrh w9, [x0, #2] +; NONEON-NOSVE-NEXT: str d0, [sp, #48] +; NONEON-NOSVE-NEXT: strh w9, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #52] +; NONEON-NOSVE-NEXT: strh w9, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #48] +; NONEON-NOSVE-NEXT: strh w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] ; NONEON-NOSVE-NEXT: .LBB14_4: 
// %else2 ; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB14_6 ; NONEON-NOSVE-NEXT: // %bb.5: // %cond.load4 -; NONEON-NOSVE-NEXT: mov v0.h[1], v0.h[1] -; NONEON-NOSVE-NEXT: add x8, x0, #4 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x8] +; NONEON-NOSVE-NEXT: ldrh w8, [x0, #4] +; NONEON-NOSVE-NEXT: str d0, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] ; NONEON-NOSVE-NEXT: .LBB14_6: // %else5 -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #12] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %load_value = tail call <3 x i16> @llvm.masked.load.v3i16.p0(ptr %load_ptr, i32 4, <3 x i1> %pm, <3 x i16> zeroinitializer) %extend = sext <3 x i16> %load_value to <3 x i32> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll index 0904399558aee1..a79ce9db9abfde 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll @@ -23,13 +23,21 @@ define void @masked_store_v4i8(ptr %dst, <4 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_store_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI0_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI0_0] -; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addv h0, v0.4h -; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: str d0, 
[sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w11, [sp] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: and w9, w9, #0x4 +; NONEON-NOSVE-NEXT: and w10, w10, #0x8 +; NONEON-NOSVE-NEXT: bfxil w8, w11, #0, #1 +; NONEON-NOSVE-NEXT: orr w9, w9, w10 +; NONEON-NOSVE-NEXT: orr w8, w8, w9 ; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB0_5 ; NONEON-NOSVE-NEXT: // %bb.1: // %else ; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB0_6 @@ -38,6 +46,7 @@ define void @masked_store_v4i8(ptr %dst, <4 x i1> %mask) { ; NONEON-NOSVE-NEXT: .LBB0_3: // %else4 ; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB0_8 ; NONEON-NOSVE-NEXT: .LBB0_4: // %else6 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret ; NONEON-NOSVE-NEXT: .LBB0_5: // %cond.store ; NONEON-NOSVE-NEXT: strb wzr, [x0] @@ -50,6 +59,7 @@ define void @masked_store_v4i8(ptr %dst, <4 x i1> %mask) { ; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB0_4 ; NONEON-NOSVE-NEXT: .LBB0_8: // %cond.store5 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #3] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v4i8(<4 x i8> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask) ret void @@ -69,14 +79,39 @@ define void @masked_store_v8i8(ptr %dst, <8 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_store_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.8b, v0.8b, #7 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI1_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI1_0] -; NONEON-NOSVE-NEXT: cmlt v0.8b, v0.8b, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addv b0, v0.8b -; NONEON-NOSVE-NEXT: fmov w8, s0 -; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB1_9 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #4] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #5] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #6] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w11, [sp] +; NONEON-NOSVE-NEXT: sbfx w12, w12, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w13, w13, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x4 +; NONEON-NOSVE-NEXT: and w9, w9, #0x8 +; NONEON-NOSVE-NEXT: sbfx w14, w14, #0, #1 +; NONEON-NOSVE-NEXT: orr w8, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: and w10, w10, #0x2 +; NONEON-NOSVE-NEXT: and w12, w12, #0x10 +; NONEON-NOSVE-NEXT: bfxil w10, w11, #0, #1 +; NONEON-NOSVE-NEXT: and w11, w13, #0x20 +; NONEON-NOSVE-NEXT: orr w8, w8, w12 +; NONEON-NOSVE-NEXT: and w12, w14, #0x40 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: orr w8, w10, w8 +; NONEON-NOSVE-NEXT: orr w10, w11, w12 +; NONEON-NOSVE-NEXT: orr w8, w8, w10 +; NONEON-NOSVE-NEXT: and w9, w9, #0x80 +; NONEON-NOSVE-NEXT: add w9, w8, w9 +; NONEON-NOSVE-NEXT: and w8, w9, #0xff +; NONEON-NOSVE-NEXT: tbnz w9, #0, .LBB1_9 ; NONEON-NOSVE-NEXT: // %bb.1: // %else ; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB1_10 ; NONEON-NOSVE-NEXT: .LBB1_2: // %else2 @@ -92,6 +127,7 @@ define void @masked_store_v8i8(ptr %dst, <8 x i1> %mask) { ; NONEON-NOSVE-NEXT: .LBB1_7: // %else12 ; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB1_16 ; NONEON-NOSVE-NEXT: .LBB1_8: // %else14 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret ; NONEON-NOSVE-NEXT: .LBB1_9: // %cond.store ; NONEON-NOSVE-NEXT: strb wzr, [x0] @@ -116,6 +152,7 @@ define void @masked_store_v8i8(ptr %dst, <8 x i1> %mask) { ; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB1_8 ; NONEON-NOSVE-NEXT: .LBB1_16: // %cond.store13 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #7] +; 
NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v8i8(<8 x i8> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask) ret void @@ -135,15 +172,89 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_store_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.16b, v0.16b, #7 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI2_0 -; NONEON-NOSVE-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] -; NONEON-NOSVE-NEXT: cmlt v0.16b, v0.16b, #0 -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: and w8, w8, #0x1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: and w8, w8, #0x1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x80 +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x40 +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x20 +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x10 +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x4 +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: strb 
w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x80 +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x40 +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x20 +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x10 +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x4 +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: addv h0, v0.8h -; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #44] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: add w9, w10, w11 +; NONEON-NOSVE-NEXT: add w10, w12, w13 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: add w9, w10, w14 +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #46] +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: add w8, 
w8, w10 ; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB2_17 ; NONEON-NOSVE-NEXT: // %bb.1: // %else ; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB2_18 @@ -176,6 +287,7 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) { ; NONEON-NOSVE-NEXT: .LBB2_15: // %else28 ; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB2_32 ; NONEON-NOSVE-NEXT: .LBB2_16: // %else30 +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret ; NONEON-NOSVE-NEXT: .LBB2_17: // %cond.store ; NONEON-NOSVE-NEXT: strb wzr, [x0] @@ -224,6 +336,7 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) { ; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB2_16 ; NONEON-NOSVE-NEXT: .LBB2_32: // %cond.store29 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #15] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v16i8(<16 x i8> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask) ret void @@ -308,241 +421,328 @@ define void @masked_store_v32i8(ptr %dst, <32 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_store_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr w8, [sp, #72] -; NONEON-NOSVE-NEXT: fmov s1, w1 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #80] -; NONEON-NOSVE-NEXT: fmov s0, w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #88] -; NONEON-NOSVE-NEXT: mov v1.b[1], w2 -; NONEON-NOSVE-NEXT: mov v0.b[1], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp] -; NONEON-NOSVE-NEXT: mov v1.b[2], w3 -; NONEON-NOSVE-NEXT: mov v0.b[2], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #96] -; NONEON-NOSVE-NEXT: mov v1.b[3], w4 -; NONEON-NOSVE-NEXT: mov v0.b[3], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #104] -; NONEON-NOSVE-NEXT: mov v1.b[4], w5 -; NONEON-NOSVE-NEXT: mov v0.b[4], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #112] -; NONEON-NOSVE-NEXT: mov v1.b[5], w6 -; NONEON-NOSVE-NEXT: mov v0.b[5], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #120] -; NONEON-NOSVE-NEXT: mov v1.b[6], w7 -; NONEON-NOSVE-NEXT: mov v0.b[6], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #128] -; NONEON-NOSVE-NEXT: mov v1.b[7], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] -; 
NONEON-NOSVE-NEXT: mov v0.b[7], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #136] -; NONEON-NOSVE-NEXT: mov v1.b[8], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] -; NONEON-NOSVE-NEXT: mov v0.b[8], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #144] -; NONEON-NOSVE-NEXT: mov v1.b[9], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #24] -; NONEON-NOSVE-NEXT: mov v0.b[9], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #152] -; NONEON-NOSVE-NEXT: mov v1.b[10], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #32] -; NONEON-NOSVE-NEXT: mov v0.b[10], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #160] -; NONEON-NOSVE-NEXT: mov v1.b[11], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #40] -; NONEON-NOSVE-NEXT: mov v0.b[11], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #168] -; NONEON-NOSVE-NEXT: mov v1.b[12], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #48] -; NONEON-NOSVE-NEXT: mov v0.b[12], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #176] -; NONEON-NOSVE-NEXT: mov v1.b[13], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #56] -; NONEON-NOSVE-NEXT: mov v0.b[13], w8 +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #216] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #152] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #272] +; NONEON-NOSVE-NEXT: ldr w11, [sp, #176] +; NONEON-NOSVE-NEXT: ldr w12, [sp, #160] +; NONEON-NOSVE-NEXT: and w8, w8, #0x1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: and w8, w9, #0x1 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #264] +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w9, #0x80 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #256] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: and w8, w9, #0x40 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #248] +; NONEON-NOSVE-NEXT: strb w8, [sp, 
#30] +; NONEON-NOSVE-NEXT: and w8, w9, #0x20 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #240] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: and w8, w9, #0x10 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #232] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: and w8, w9, #0x8 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #224] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: and w8, w9, #0x4 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #208] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: and w8, w9, #0x2 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #200] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: and w8, w9, #0x80 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #192] +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] ; NONEON-NOSVE-NEXT: ldr w8, [sp, #184] -; NONEON-NOSVE-NEXT: mov v1.b[14], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #64] -; NONEON-NOSVE-NEXT: mov v0.b[14], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #192] -; NONEON-NOSVE-NEXT: mov v1.b[15], w9 -; NONEON-NOSVE-NEXT: mov v0.b[15], w8 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI3_0 -; NONEON-NOSVE-NEXT: ldr q2, [x8, :lo12:.LCPI3_0] -; NONEON-NOSVE-NEXT: shl v1.16b, v1.16b, #7 -; NONEON-NOSVE-NEXT: shl v0.16b, v0.16b, #7 -; NONEON-NOSVE-NEXT: cmlt v1.16b, v1.16b, #0 -; NONEON-NOSVE-NEXT: cmlt v0.16b, v0.16b, #0 -; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v2.16b -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; NONEON-NOSVE-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; NONEON-NOSVE-NEXT: zip1 v1.16b, v1.16b, v3.16b -; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: addv h1, v1.8h -; NONEON-NOSVE-NEXT: addv h0, v0.8h -; NONEON-NOSVE-NEXT: fmov w8, s1 -; NONEON-NOSVE-NEXT: fmov w9, s0 -; 
NONEON-NOSVE-NEXT: bfi w8, w9, #16, #16 -; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB3_33 +; NONEON-NOSVE-NEXT: and w9, w9, #0x40 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: strb w9, [sp, #22] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #168] +; NONEON-NOSVE-NEXT: and w10, w10, #0x20 +; NONEON-NOSVE-NEXT: and w8, w8, #0x10 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: strb w10, [sp, #21] +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: and w8, w11, #0x8 +; NONEON-NOSVE-NEXT: sbfx w10, w12, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: and w8, w9, #0x4 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #88] +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: and w8, w10, #0x2 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #136] +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: and w8, w9, #0x1 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #144] +; NONEON-NOSVE-NEXT: strb w8, [sp, #8] +; NONEON-NOSVE-NEXT: and w8, w1, #0x1 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #104] +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp] +; NONEON-NOSVE-NEXT: ldr w12, [sp, #80] +; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1 +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: and w8, w9, #0x80 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #128] +; NONEON-NOSVE-NEXT: strb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; NONEON-NOSVE-NEXT: and w8, w9, #0x40 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #120] +; NONEON-NOSVE-NEXT: strb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #112] +; NONEON-NOSVE-NEXT: and w9, w9, #0x20 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: strb w9, [sp, #13] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #96] +; NONEON-NOSVE-NEXT: and w10, w10, #0x10 +; NONEON-NOSVE-NEXT: zip1 v2.16b, v0.16b, v1.16b +; 
NONEON-NOSVE-NEXT: and w8, w8, #0x8 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: strb w10, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #11] +; NONEON-NOSVE-NEXT: and w8, w11, #0x4 +; NONEON-NOSVE-NEXT: sbfx w10, w12, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #10] +; NONEON-NOSVE-NEXT: and w8, w9, #0x2 +; NONEON-NOSVE-NEXT: sbfx w9, w7, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #9] +; NONEON-NOSVE-NEXT: and w8, w10, #0x80 +; NONEON-NOSVE-NEXT: sbfx w10, w6, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #7] +; NONEON-NOSVE-NEXT: and w8, w9, #0x40 +; NONEON-NOSVE-NEXT: sbfx w9, w5, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #6] +; NONEON-NOSVE-NEXT: and w8, w10, #0x20 +; NONEON-NOSVE-NEXT: sbfx w10, w4, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #5] +; NONEON-NOSVE-NEXT: and w8, w9, #0x10 +; NONEON-NOSVE-NEXT: sbfx w9, w3, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #4] +; NONEON-NOSVE-NEXT: and w8, w10, #0x8 +; NONEON-NOSVE-NEXT: sbfx w10, w2, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #3] +; NONEON-NOSVE-NEXT: and w8, w9, #0x4 +; NONEON-NOSVE-NEXT: strb w8, [sp, #2] +; NONEON-NOSVE-NEXT: and w8, w10, #0x2 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1] +; NONEON-NOSVE-NEXT: ldr q0, [sp] +; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q2, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #58] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #40] +; NONEON-NOSVE-NEXT: add w9, w10, w11 +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #34] +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: add w9, w12, w13 +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #32] +; NONEON-NOSVE-NEXT: 
ldrh w13, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #42] +; NONEON-NOSVE-NEXT: add w9, w9, w10 +; NONEON-NOSVE-NEXT: add w10, w12, w11 +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #44] +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: add w12, w13, w14 +; NONEON-NOSVE-NEXT: add w14, w15, w16 +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #62] +; NONEON-NOSVE-NEXT: add w10, w10, w12 +; NONEON-NOSVE-NEXT: add w11, w14, w11 +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #46] +; NONEON-NOSVE-NEXT: add w9, w10, w11 +; NONEON-NOSVE-NEXT: add w10, w8, w13 +; NONEON-NOSVE-NEXT: add w8, w9, w12 +; NONEON-NOSVE-NEXT: bfi w8, w10, #16, #16 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB3_34 ; NONEON-NOSVE-NEXT: // %bb.1: // %else -; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB3_34 +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB3_35 ; NONEON-NOSVE-NEXT: .LBB3_2: // %else2 -; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB3_35 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB3_36 ; NONEON-NOSVE-NEXT: .LBB3_3: // %else4 -; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB3_36 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB3_37 ; NONEON-NOSVE-NEXT: .LBB3_4: // %else6 -; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB3_37 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB3_38 ; NONEON-NOSVE-NEXT: .LBB3_5: // %else8 -; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB3_38 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB3_39 ; NONEON-NOSVE-NEXT: .LBB3_6: // %else10 -; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB3_39 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB3_40 ; NONEON-NOSVE-NEXT: .LBB3_7: // %else12 -; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB3_40 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB3_41 ; NONEON-NOSVE-NEXT: .LBB3_8: // %else14 -; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB3_41 +; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB3_42 ; NONEON-NOSVE-NEXT: .LBB3_9: // %else16 -; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB3_42 +; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB3_43 ; NONEON-NOSVE-NEXT: .LBB3_10: // %else18 -; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB3_43 +; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB3_44 ; NONEON-NOSVE-NEXT: .LBB3_11: // %else20 -; 
NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB3_44 +; NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB3_45 ; NONEON-NOSVE-NEXT: .LBB3_12: // %else22 -; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB3_45 +; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB3_46 ; NONEON-NOSVE-NEXT: .LBB3_13: // %else24 -; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB3_46 +; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB3_47 ; NONEON-NOSVE-NEXT: .LBB3_14: // %else26 -; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB3_47 +; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB3_48 ; NONEON-NOSVE-NEXT: .LBB3_15: // %else28 -; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB3_48 +; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB3_49 ; NONEON-NOSVE-NEXT: .LBB3_16: // %else30 -; NONEON-NOSVE-NEXT: tbnz w8, #16, .LBB3_49 +; NONEON-NOSVE-NEXT: tbnz w8, #16, .LBB3_50 ; NONEON-NOSVE-NEXT: .LBB3_17: // %else32 -; NONEON-NOSVE-NEXT: tbnz w8, #17, .LBB3_50 +; NONEON-NOSVE-NEXT: tbnz w8, #17, .LBB3_51 ; NONEON-NOSVE-NEXT: .LBB3_18: // %else34 -; NONEON-NOSVE-NEXT: tbnz w8, #18, .LBB3_51 +; NONEON-NOSVE-NEXT: tbnz w8, #18, .LBB3_52 ; NONEON-NOSVE-NEXT: .LBB3_19: // %else36 -; NONEON-NOSVE-NEXT: tbnz w8, #19, .LBB3_52 +; NONEON-NOSVE-NEXT: tbnz w8, #19, .LBB3_53 ; NONEON-NOSVE-NEXT: .LBB3_20: // %else38 -; NONEON-NOSVE-NEXT: tbnz w8, #20, .LBB3_53 +; NONEON-NOSVE-NEXT: tbnz w8, #20, .LBB3_54 ; NONEON-NOSVE-NEXT: .LBB3_21: // %else40 -; NONEON-NOSVE-NEXT: tbnz w8, #21, .LBB3_54 +; NONEON-NOSVE-NEXT: tbnz w8, #21, .LBB3_55 ; NONEON-NOSVE-NEXT: .LBB3_22: // %else42 -; NONEON-NOSVE-NEXT: tbnz w8, #22, .LBB3_55 +; NONEON-NOSVE-NEXT: tbnz w8, #22, .LBB3_56 ; NONEON-NOSVE-NEXT: .LBB3_23: // %else44 -; NONEON-NOSVE-NEXT: tbnz w8, #23, .LBB3_56 +; NONEON-NOSVE-NEXT: tbnz w8, #23, .LBB3_57 ; NONEON-NOSVE-NEXT: .LBB3_24: // %else46 -; NONEON-NOSVE-NEXT: tbnz w8, #24, .LBB3_57 +; NONEON-NOSVE-NEXT: tbnz w8, #24, .LBB3_58 ; NONEON-NOSVE-NEXT: .LBB3_25: // %else48 -; NONEON-NOSVE-NEXT: tbnz w8, #25, .LBB3_58 +; NONEON-NOSVE-NEXT: tbnz w8, #25, .LBB3_59 ; NONEON-NOSVE-NEXT: .LBB3_26: // %else50 -; NONEON-NOSVE-NEXT: 
tbnz w8, #26, .LBB3_59 +; NONEON-NOSVE-NEXT: tbnz w8, #26, .LBB3_60 ; NONEON-NOSVE-NEXT: .LBB3_27: // %else52 -; NONEON-NOSVE-NEXT: tbnz w8, #27, .LBB3_60 +; NONEON-NOSVE-NEXT: tbnz w8, #27, .LBB3_61 ; NONEON-NOSVE-NEXT: .LBB3_28: // %else54 -; NONEON-NOSVE-NEXT: tbnz w8, #28, .LBB3_61 +; NONEON-NOSVE-NEXT: tbnz w8, #28, .LBB3_62 ; NONEON-NOSVE-NEXT: .LBB3_29: // %else56 -; NONEON-NOSVE-NEXT: tbnz w8, #29, .LBB3_62 +; NONEON-NOSVE-NEXT: tbnz w8, #29, .LBB3_63 ; NONEON-NOSVE-NEXT: .LBB3_30: // %else58 -; NONEON-NOSVE-NEXT: tbnz w8, #30, .LBB3_63 +; NONEON-NOSVE-NEXT: tbnz w8, #30, .LBB3_64 ; NONEON-NOSVE-NEXT: .LBB3_31: // %else60 -; NONEON-NOSVE-NEXT: tbnz w8, #31, .LBB3_64 -; NONEON-NOSVE-NEXT: .LBB3_32: // %else62 +; NONEON-NOSVE-NEXT: tbz w8, #31, .LBB3_33 +; NONEON-NOSVE-NEXT: .LBB3_32: // %cond.store61 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #31] +; NONEON-NOSVE-NEXT: .LBB3_33: // %else62 +; NONEON-NOSVE-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret -; NONEON-NOSVE-NEXT: .LBB3_33: // %cond.store +; NONEON-NOSVE-NEXT: .LBB3_34: // %cond.store ; NONEON-NOSVE-NEXT: strb wzr, [x0] ; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB3_2 -; NONEON-NOSVE-NEXT: .LBB3_34: // %cond.store1 +; NONEON-NOSVE-NEXT: .LBB3_35: // %cond.store1 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #1] ; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB3_3 -; NONEON-NOSVE-NEXT: .LBB3_35: // %cond.store3 +; NONEON-NOSVE-NEXT: .LBB3_36: // %cond.store3 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #2] ; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB3_4 -; NONEON-NOSVE-NEXT: .LBB3_36: // %cond.store5 +; NONEON-NOSVE-NEXT: .LBB3_37: // %cond.store5 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #3] ; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB3_5 -; NONEON-NOSVE-NEXT: .LBB3_37: // %cond.store7 +; NONEON-NOSVE-NEXT: .LBB3_38: // %cond.store7 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #4] ; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB3_6 -; NONEON-NOSVE-NEXT: .LBB3_38: // %cond.store9 +; NONEON-NOSVE-NEXT: .LBB3_39: // 
%cond.store9 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #5] ; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB3_7 -; NONEON-NOSVE-NEXT: .LBB3_39: // %cond.store11 +; NONEON-NOSVE-NEXT: .LBB3_40: // %cond.store11 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #6] ; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB3_8 -; NONEON-NOSVE-NEXT: .LBB3_40: // %cond.store13 +; NONEON-NOSVE-NEXT: .LBB3_41: // %cond.store13 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #7] ; NONEON-NOSVE-NEXT: tbz w8, #8, .LBB3_9 -; NONEON-NOSVE-NEXT: .LBB3_41: // %cond.store15 +; NONEON-NOSVE-NEXT: .LBB3_42: // %cond.store15 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #8] ; NONEON-NOSVE-NEXT: tbz w8, #9, .LBB3_10 -; NONEON-NOSVE-NEXT: .LBB3_42: // %cond.store17 +; NONEON-NOSVE-NEXT: .LBB3_43: // %cond.store17 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #9] ; NONEON-NOSVE-NEXT: tbz w8, #10, .LBB3_11 -; NONEON-NOSVE-NEXT: .LBB3_43: // %cond.store19 +; NONEON-NOSVE-NEXT: .LBB3_44: // %cond.store19 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #10] ; NONEON-NOSVE-NEXT: tbz w8, #11, .LBB3_12 -; NONEON-NOSVE-NEXT: .LBB3_44: // %cond.store21 +; NONEON-NOSVE-NEXT: .LBB3_45: // %cond.store21 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #11] ; NONEON-NOSVE-NEXT: tbz w8, #12, .LBB3_13 -; NONEON-NOSVE-NEXT: .LBB3_45: // %cond.store23 +; NONEON-NOSVE-NEXT: .LBB3_46: // %cond.store23 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #12] ; NONEON-NOSVE-NEXT: tbz w8, #13, .LBB3_14 -; NONEON-NOSVE-NEXT: .LBB3_46: // %cond.store25 +; NONEON-NOSVE-NEXT: .LBB3_47: // %cond.store25 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #13] ; NONEON-NOSVE-NEXT: tbz w8, #14, .LBB3_15 -; NONEON-NOSVE-NEXT: .LBB3_47: // %cond.store27 +; NONEON-NOSVE-NEXT: .LBB3_48: // %cond.store27 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #14] ; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB3_16 -; NONEON-NOSVE-NEXT: .LBB3_48: // %cond.store29 +; NONEON-NOSVE-NEXT: .LBB3_49: // %cond.store29 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #15] ; NONEON-NOSVE-NEXT: tbz w8, #16, .LBB3_17 -; NONEON-NOSVE-NEXT: .LBB3_49: // %cond.store31 +; NONEON-NOSVE-NEXT: .LBB3_50: // 
%cond.store31 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #16] ; NONEON-NOSVE-NEXT: tbz w8, #17, .LBB3_18 -; NONEON-NOSVE-NEXT: .LBB3_50: // %cond.store33 +; NONEON-NOSVE-NEXT: .LBB3_51: // %cond.store33 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #17] ; NONEON-NOSVE-NEXT: tbz w8, #18, .LBB3_19 -; NONEON-NOSVE-NEXT: .LBB3_51: // %cond.store35 +; NONEON-NOSVE-NEXT: .LBB3_52: // %cond.store35 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #18] ; NONEON-NOSVE-NEXT: tbz w8, #19, .LBB3_20 -; NONEON-NOSVE-NEXT: .LBB3_52: // %cond.store37 +; NONEON-NOSVE-NEXT: .LBB3_53: // %cond.store37 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #19] ; NONEON-NOSVE-NEXT: tbz w8, #20, .LBB3_21 -; NONEON-NOSVE-NEXT: .LBB3_53: // %cond.store39 +; NONEON-NOSVE-NEXT: .LBB3_54: // %cond.store39 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #20] ; NONEON-NOSVE-NEXT: tbz w8, #21, .LBB3_22 -; NONEON-NOSVE-NEXT: .LBB3_54: // %cond.store41 +; NONEON-NOSVE-NEXT: .LBB3_55: // %cond.store41 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #21] ; NONEON-NOSVE-NEXT: tbz w8, #22, .LBB3_23 -; NONEON-NOSVE-NEXT: .LBB3_55: // %cond.store43 +; NONEON-NOSVE-NEXT: .LBB3_56: // %cond.store43 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #22] ; NONEON-NOSVE-NEXT: tbz w8, #23, .LBB3_24 -; NONEON-NOSVE-NEXT: .LBB3_56: // %cond.store45 +; NONEON-NOSVE-NEXT: .LBB3_57: // %cond.store45 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #23] ; NONEON-NOSVE-NEXT: tbz w8, #24, .LBB3_25 -; NONEON-NOSVE-NEXT: .LBB3_57: // %cond.store47 +; NONEON-NOSVE-NEXT: .LBB3_58: // %cond.store47 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #24] ; NONEON-NOSVE-NEXT: tbz w8, #25, .LBB3_26 -; NONEON-NOSVE-NEXT: .LBB3_58: // %cond.store49 +; NONEON-NOSVE-NEXT: .LBB3_59: // %cond.store49 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #25] ; NONEON-NOSVE-NEXT: tbz w8, #26, .LBB3_27 -; NONEON-NOSVE-NEXT: .LBB3_59: // %cond.store51 +; NONEON-NOSVE-NEXT: .LBB3_60: // %cond.store51 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #26] ; NONEON-NOSVE-NEXT: tbz w8, #27, .LBB3_28 -; NONEON-NOSVE-NEXT: .LBB3_60: // %cond.store53 +; NONEON-NOSVE-NEXT: .LBB3_61: 
// %cond.store53 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #27] ; NONEON-NOSVE-NEXT: tbz w8, #28, .LBB3_29 -; NONEON-NOSVE-NEXT: .LBB3_61: // %cond.store55 +; NONEON-NOSVE-NEXT: .LBB3_62: // %cond.store55 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #28] ; NONEON-NOSVE-NEXT: tbz w8, #29, .LBB3_30 -; NONEON-NOSVE-NEXT: .LBB3_62: // %cond.store57 +; NONEON-NOSVE-NEXT: .LBB3_63: // %cond.store57 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #29] ; NONEON-NOSVE-NEXT: tbz w8, #30, .LBB3_31 -; NONEON-NOSVE-NEXT: .LBB3_63: // %cond.store59 +; NONEON-NOSVE-NEXT: .LBB3_64: // %cond.store59 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #30] -; NONEON-NOSVE-NEXT: tbz w8, #31, .LBB3_32 -; NONEON-NOSVE-NEXT: .LBB3_64: // %cond.store61 -; NONEON-NOSVE-NEXT: strb wzr, [x0, #31] -; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: tbnz w8, #31, .LBB3_32 +; NONEON-NOSVE-NEXT: b .LBB3_33 call void @llvm.masked.store.v32i8(<32 x i8> zeroinitializer, ptr %dst, i32 8, <32 x i1> %mask) ret void } @@ -571,17 +771,18 @@ define void @masked_store_v2f16(ptr %dst, <2 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_store_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #31 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI4_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI4_0] -; NONEON-NOSVE-NEXT: cmlt v0.2s, v0.2s, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addp v0.2s, v0.2s, v0.2s -; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: bfxil w8, w9, #0, #1 ; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB4_3 ; NONEON-NOSVE-NEXT: // %bb.1: // %else ; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB4_4 ; NONEON-NOSVE-NEXT: .LBB4_2: // %else2 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret ; NONEON-NOSVE-NEXT: .LBB4_3: // %cond.store ; NONEON-NOSVE-NEXT: fmov s0, wzr @@ -590,6 +791,7 @@ define void @masked_store_v2f16(ptr %dst, <2 x i1> %mask) { ; NONEON-NOSVE-NEXT: .LBB4_4: // %cond.store1 ; NONEON-NOSVE-NEXT: fmov s0, wzr ; NONEON-NOSVE-NEXT: str h0, [x0, #2] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v2f16(<2 x half> zeroinitializer, ptr %dst, i32 8, <2 x i1> %mask) ret void @@ -609,13 +811,21 @@ define void @masked_store_v4f16(ptr %dst, <4 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_store_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI5_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI5_0] -; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addv h0, v0.4h -; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w11, [sp] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: and w9, w9, #0x4 +; NONEON-NOSVE-NEXT: and w10, w10, #0x8 +; NONEON-NOSVE-NEXT: bfxil w8, w11, #0, #1 +; NONEON-NOSVE-NEXT: orr w9, w9, w10 +; NONEON-NOSVE-NEXT: orr w8, w8, w9 ; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB5_5 ; NONEON-NOSVE-NEXT: // %bb.1: // %else ; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB5_6 @@ -624,6 +834,7 @@ define void @masked_store_v4f16(ptr %dst, <4 x i1> %mask) { ; NONEON-NOSVE-NEXT: .LBB5_3: // %else4 ; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB5_8 ; NONEON-NOSVE-NEXT: .LBB5_4: // %else6 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret ; NONEON-NOSVE-NEXT: .LBB5_5: // %cond.store ; NONEON-NOSVE-NEXT: fmov s0, wzr @@ -640,6 +851,7 @@ define void @masked_store_v4f16(ptr %dst, <4 x i1> %mask) { ; NONEON-NOSVE-NEXT: .LBB5_8: // %cond.store5 ; NONEON-NOSVE-NEXT: fmov s0, wzr ; NONEON-NOSVE-NEXT: str h0, [x0, #6] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v4f16(<4 x half> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask) ret void @@ -660,14 +872,39 @@ define void @masked_store_v8f16(ptr %dst, <8 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_store_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.8b, v0.8b, #7 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI6_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI6_0] -; NONEON-NOSVE-NEXT: cmlt v0.8b, v0.8b, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addv b0, v0.8b -; NONEON-NOSVE-NEXT: fmov w8, s0 -; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB6_9 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #4] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #5] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #6] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w11, [sp] +; NONEON-NOSVE-NEXT: sbfx w12, w12, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w13, w13, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x4 +; NONEON-NOSVE-NEXT: and w9, w9, #0x8 +; NONEON-NOSVE-NEXT: sbfx w14, w14, #0, #1 +; NONEON-NOSVE-NEXT: orr w8, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: and w10, w10, #0x2 +; NONEON-NOSVE-NEXT: and w12, w12, #0x10 +; NONEON-NOSVE-NEXT: bfxil w10, w11, #0, #1 +; NONEON-NOSVE-NEXT: and w11, w13, #0x20 +; NONEON-NOSVE-NEXT: orr w8, w8, w12 +; NONEON-NOSVE-NEXT: and w12, w14, #0x40 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: orr w8, w10, w8 +; NONEON-NOSVE-NEXT: orr w10, w11, w12 +; NONEON-NOSVE-NEXT: orr w8, w8, w10 +; NONEON-NOSVE-NEXT: and w9, w9, #0x80 +; NONEON-NOSVE-NEXT: add w9, w8, w9 +; NONEON-NOSVE-NEXT: and w8, w9, #0xff +; NONEON-NOSVE-NEXT: tbnz w9, #0, .LBB6_9 ; NONEON-NOSVE-NEXT: // %bb.1: // %else ; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB6_10 ; NONEON-NOSVE-NEXT: .LBB6_2: // %else2 @@ -683,6 +920,7 @@ define void @masked_store_v8f16(ptr %dst, <8 x i1> %mask) { ; NONEON-NOSVE-NEXT: .LBB6_7: // %else12 ; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB6_16 ; NONEON-NOSVE-NEXT: .LBB6_8: // %else14 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret ; NONEON-NOSVE-NEXT: .LBB6_9: // %cond.store ; NONEON-NOSVE-NEXT: fmov s0, wzr @@ -715,6 +953,7 @@ define void @masked_store_v8f16(ptr %dst, <8 x i1> %mask) { ; NONEON-NOSVE-NEXT: .LBB6_16: // %cond.store13 ; NONEON-NOSVE-NEXT: fmov s0, wzr ; NONEON-NOSVE-NEXT: str h0, [x0, #14] +; 
NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v8f16(<8 x half> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask) ret void @@ -743,15 +982,89 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_store_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.16b, v0.16b, #7 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI7_0 -; NONEON-NOSVE-NEXT: ldr q1, [x8, :lo12:.LCPI7_0] -; NONEON-NOSVE-NEXT: cmlt v0.16b, v0.16b, #0 -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: and w8, w8, #0x1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: and w8, w8, #0x1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x80 +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x40 +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x20 +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x10 +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x4 +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: 
strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x80 +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x40 +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x20 +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x10 +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x4 +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: addv h0, v0.8h -; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #44] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: add w9, w10, w11 +; NONEON-NOSVE-NEXT: add w10, w12, w13 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: add w9, w10, w14 +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #46] +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: add 
w8, w8, w10 ; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB7_17 ; NONEON-NOSVE-NEXT: // %bb.1: // %else ; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB7_18 @@ -784,6 +1097,7 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) { ; NONEON-NOSVE-NEXT: .LBB7_15: // %else28 ; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB7_32 ; NONEON-NOSVE-NEXT: .LBB7_16: // %else30 +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret ; NONEON-NOSVE-NEXT: .LBB7_17: // %cond.store ; NONEON-NOSVE-NEXT: fmov s0, wzr @@ -848,6 +1162,7 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) { ; NONEON-NOSVE-NEXT: .LBB7_32: // %cond.store29 ; NONEON-NOSVE-NEXT: fmov s0, wzr ; NONEON-NOSVE-NEXT: str h0, [x0, #30] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v16f16(<16 x half> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask) ret void @@ -868,13 +1183,21 @@ define void @masked_store_v4f32(ptr %dst, <4 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_store_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI8_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI8_0] -; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addv h0, v0.4h -; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w11, [sp] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: and w9, w9, #0x4 +; NONEON-NOSVE-NEXT: and w10, w10, #0x8 +; NONEON-NOSVE-NEXT: bfxil w8, w11, #0, #1 +; NONEON-NOSVE-NEXT: orr w9, w9, w10 +; NONEON-NOSVE-NEXT: orr w8, w8, w9 ; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB8_5 ; NONEON-NOSVE-NEXT: // %bb.1: // %else ; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB8_6 @@ -883,6 +1206,7 @@ define void @masked_store_v4f32(ptr %dst, <4 x i1> %mask) { ; NONEON-NOSVE-NEXT: .LBB8_3: // %else4 ; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB8_8 ; NONEON-NOSVE-NEXT: .LBB8_4: // %else6 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret ; NONEON-NOSVE-NEXT: .LBB8_5: // %cond.store ; NONEON-NOSVE-NEXT: str wzr, [x0] @@ -895,6 +1219,7 @@ define void @masked_store_v4f32(ptr %dst, <4 x i1> %mask) { ; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB8_4 ; NONEON-NOSVE-NEXT: .LBB8_8: // %cond.store5 ; NONEON-NOSVE-NEXT: str wzr, [x0, #12] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v4f32(<4 x float> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask) ret void @@ -949,14 +1274,39 @@ define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_store_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.8b, v0.8b, #7 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI9_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI9_0] -; NONEON-NOSVE-NEXT: cmlt v0.8b, v0.8b, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addv b0, v0.8b -; NONEON-NOSVE-NEXT: fmov w8, s0 -; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB9_9 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #4] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #5] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #6] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w11, [sp] +; NONEON-NOSVE-NEXT: sbfx w12, w12, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w13, w13, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x4 +; NONEON-NOSVE-NEXT: and w9, w9, #0x8 +; NONEON-NOSVE-NEXT: sbfx w14, w14, #0, #1 +; NONEON-NOSVE-NEXT: orr w8, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: and w10, w10, #0x2 +; NONEON-NOSVE-NEXT: and w12, w12, #0x10 +; NONEON-NOSVE-NEXT: bfxil w10, w11, #0, #1 +; NONEON-NOSVE-NEXT: and w11, w13, #0x20 +; NONEON-NOSVE-NEXT: orr w8, w8, w12 +; NONEON-NOSVE-NEXT: and w12, w14, #0x40 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: orr w8, w10, w8 +; NONEON-NOSVE-NEXT: orr w10, w11, w12 +; NONEON-NOSVE-NEXT: orr w8, w8, w10 +; NONEON-NOSVE-NEXT: and w9, w9, #0x80 +; NONEON-NOSVE-NEXT: add w9, w8, w9 +; NONEON-NOSVE-NEXT: and w8, w9, #0xff +; NONEON-NOSVE-NEXT: tbnz w9, #0, .LBB9_9 ; NONEON-NOSVE-NEXT: // %bb.1: // %else ; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB9_10 ; NONEON-NOSVE-NEXT: .LBB9_2: // %else2 @@ -972,6 +1322,7 @@ define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) { ; NONEON-NOSVE-NEXT: .LBB9_7: // %else12 ; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB9_16 ; NONEON-NOSVE-NEXT: .LBB9_8: // %else14 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret ; NONEON-NOSVE-NEXT: .LBB9_9: // %cond.store ; NONEON-NOSVE-NEXT: str wzr, [x0] @@ -996,6 +1347,7 @@ define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) { ; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB9_8 ; NONEON-NOSVE-NEXT: .LBB9_16: // %cond.store13 ; NONEON-NOSVE-NEXT: str wzr, [x0, #28] +; 
NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v8f32(<8 x float> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask) ret void @@ -1016,23 +1368,25 @@ define void @masked_store_v2f64(ptr %dst, <2 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_store_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #31 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI10_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI10_0] -; NONEON-NOSVE-NEXT: cmlt v0.2s, v0.2s, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addp v0.2s, v0.2s, v0.2s -; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: bfxil w8, w9, #0, #1 ; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB10_3 ; NONEON-NOSVE-NEXT: // %bb.1: // %else ; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB10_4 ; NONEON-NOSVE-NEXT: .LBB10_2: // %else2 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret ; NONEON-NOSVE-NEXT: .LBB10_3: // %cond.store ; NONEON-NOSVE-NEXT: str xzr, [x0] ; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB10_2 ; NONEON-NOSVE-NEXT: .LBB10_4: // %cond.store1 ; NONEON-NOSVE-NEXT: str xzr, [x0, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v2f64(<2 x double> zeroinitializer, ptr %dst, i32 8, <2 x i1> %mask) ret void @@ -1061,13 +1415,21 @@ define void @masked_store_v4f64(ptr %dst, <4 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_store_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI11_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI11_0] -; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addv h0, v0.4h -; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: 
str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w11, [sp] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: and w9, w9, #0x4 +; NONEON-NOSVE-NEXT: and w10, w10, #0x8 +; NONEON-NOSVE-NEXT: bfxil w8, w11, #0, #1 +; NONEON-NOSVE-NEXT: orr w9, w9, w10 +; NONEON-NOSVE-NEXT: orr w8, w8, w9 ; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB11_5 ; NONEON-NOSVE-NEXT: // %bb.1: // %else ; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB11_6 @@ -1076,6 +1438,7 @@ define void @masked_store_v4f64(ptr %dst, <4 x i1> %mask) { ; NONEON-NOSVE-NEXT: .LBB11_3: // %else4 ; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB11_8 ; NONEON-NOSVE-NEXT: .LBB11_4: // %else6 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret ; NONEON-NOSVE-NEXT: .LBB11_5: // %cond.store ; NONEON-NOSVE-NEXT: str xzr, [x0] @@ -1088,6 +1451,7 @@ define void @masked_store_v4f64(ptr %dst, <4 x i1> %mask) { ; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB11_4 ; NONEON-NOSVE-NEXT: .LBB11_8: // %cond.store5 ; NONEON-NOSVE-NEXT: str xzr, [x0, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v4f64(<4 x double> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask) ret void diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll index 6a6b47e815ac16..dbdf5f25029998 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll @@ -18,11 +18,22 @@ define void @add_v4i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: add_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr s0, [x0] -; NONEON-NOSVE-NEXT: 
ldr s1, [x1] -; NONEON-NOSVE-NEXT: uaddl v0.8h, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: uzp1 v0.8b, v0.8b, v0.8b -; NONEON-NOSVE-NEXT: str s0, [x0] +; NONEON-NOSVE-NEXT: ldrb w8, [x0, #3] +; NONEON-NOSVE-NEXT: ldrb w9, [x1, #3] +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #2] +; NONEON-NOSVE-NEXT: ldrb w11, [x0, #1] +; NONEON-NOSVE-NEXT: ldrb w12, [x1, #2] +; NONEON-NOSVE-NEXT: ldrb w13, [x0] +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w14, [x1, #1] +; NONEON-NOSVE-NEXT: ldrb w9, [x1] +; NONEON-NOSVE-NEXT: add w10, w10, w12 +; NONEON-NOSVE-NEXT: strb w8, [x0, #3] +; NONEON-NOSVE-NEXT: add w8, w11, w14 +; NONEON-NOSVE-NEXT: add w9, w13, w9 +; NONEON-NOSVE-NEXT: strb w10, [x0, #2] +; NONEON-NOSVE-NEXT: strb w8, [x0, #1] +; NONEON-NOSVE-NEXT: strb w9, [x0] ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i8>, ptr %a %op2 = load <4 x i8>, ptr %b @@ -42,10 +53,46 @@ define void @add_v8i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: add_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: ldr d1, [x1] -; NONEON-NOSVE-NEXT: add v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [x1] +; NONEON-NOSVE-NEXT: ldr d1, [x0] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, 
[sp, #19] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] ; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i8>, ptr %a %op2 = load <8 x i8>, ptr %b @@ -65,10 +112,77 @@ define void @add_v16i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: add_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] 
+; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i8>, ptr %a %op2 = load <16 x i8>, ptr %b @@ -89,11 +203,143 @@ define void @add_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: add_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: add v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: add v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb 
w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: 
add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, 
[sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -116,17 +362,12 @@ define void @add_v2i16(ptr %a, ptr %b, ptr %c) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldrh w8, [x0] ; NONEON-NOSVE-NEXT: ldrh w9, [x1] -; NONEON-NOSVE-NEXT: fmov s0, w8 -; NONEON-NOSVE-NEXT: fmov s1, w9 -; NONEON-NOSVE-NEXT: add x8, x0, #2 -; NONEON-NOSVE-NEXT: add x9, x1, #2 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x8] -; NONEON-NOSVE-NEXT: ld1 { v1.h }[2], [x9] -; NONEON-NOSVE-NEXT: add v0.2s, v0.2s, v1.2s -; NONEON-NOSVE-NEXT: mov w8, v0.s[1] -; NONEON-NOSVE-NEXT: fmov w9, s0 -; NONEON-NOSVE-NEXT: strh w9, [x0] -; NONEON-NOSVE-NEXT: strh w8, [x0, #2] +; NONEON-NOSVE-NEXT: ldrh w10, [x0, #2] +; NONEON-NOSVE-NEXT: ldrh w11, [x1, #2] +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: add w9, w10, w11 +; NONEON-NOSVE-NEXT: strh w8, [x0] +; NONEON-NOSVE-NEXT: strh w9, [x0, #2] ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i16>, ptr %a %op2 = load <2 x i16>, ptr %b @@ -146,10 +387,30 @@ define void @add_v4i16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: add_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: ldr d1, [x1] -; NONEON-NOSVE-NEXT: add v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [x1] +; NONEON-NOSVE-NEXT: ldr d1, [x0] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, 
#8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] ; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i16>, ptr %a %op2 = load <4 x i16>, ptr %b @@ -169,10 +430,45 @@ define void @add_v8i16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: add_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %op2 = load <8 x i16>, ptr %b @@ -193,11 +489,79 @@ define void @add_v16i16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: add_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: add v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: add v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: 
stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; 
NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -218,8 +582,18 @@ define void @abs_v2i32(ptr %a) { ; NONEON-NOSVE-LABEL: abs_v2i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: abs v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w9, w8, mi +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] ; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i32>, ptr %a %res = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %op1, i1 false) @@ -239,8 +613,25 @@ define void @abs_v4i32(ptr %a) { ; NONEON-NOSVE-LABEL: abs_v4i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: abs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w9, w8, mi +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w9, w8, mi +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i32>, ptr %a %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %op1, i1 false) @@ -260,10 +651,40 @@ define void @abs_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: abs_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: abs v0.4s, v0.4s -; NONEON-NOSVE-NEXT: abs v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w9, w8, mi +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w9, w8, mi +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w9, w8, mi +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w9, w8, mi +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %op1, i1 false) @@ -283,8 +704,18 @@ define void @abs_v2i64(ptr %a) { ; NONEON-NOSVE-LABEL: abs_v2i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: abs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: cneg x9, x8, mi +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: cneg x8, x8, mi +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i64>, ptr %a %res = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %op1, i1 false) @@ -304,10 +735,26 @@ define void @abs_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: abs_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: abs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: abs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: cneg x9, x8, mi +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: cneg x8, x8, mi +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: cneg x9, x8, mi +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: cneg x8, x8, mi +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %op1, i1 false) @@ -328,13 +775,32 @@ define void @fadd_v2f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fadd_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr s0, [x0] -; NONEON-NOSVE-NEXT: ldr s1, [x1] -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v1.4s -; 
NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s -; NONEON-NOSVE-NEXT: str s0, [x0] +; NONEON-NOSVE-NEXT: ldr w8, [x0] +; NONEON-NOSVE-NEXT: str w8, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr w8, [x1] +; NONEON-NOSVE-NEXT: str w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #32] +; NONEON-NOSVE-NEXT: str d0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #40] +; NONEON-NOSVE-NEXT: str w8, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x half>, ptr %a %op2 = load <2 x half>, ptr %b @@ -355,13 +821,42 @@ define void @fadd_v4f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fadd_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: ldr d1, [x1] -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [x1] +; NONEON-NOSVE-NEXT: ldr d1, [x0] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: 
fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] ; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x half>, ptr %a %op2 = load <4 x half>, ptr %b @@ -382,17 +877,69 @@ define void @fadd_v8f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fadd_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fadd v2.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v1.4h, v2.4s -; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s -; NONEON-NOSVE-NEXT: str q1, [x0] +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt 
s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %op2 = load <8 x half>, ptr %b @@ -415,25 +962,127 @@ define void @fadd_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fadd_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fcvtl v4.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v6.4s, v3.4h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fcvtl v5.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v7.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: fcvtl2 v3.4s, v3.8h -; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h -; NONEON-NOSVE-NEXT: fadd v4.4s, v5.4s, v4.4s -; NONEON-NOSVE-NEXT: fadd v5.4s, v7.4s, v6.4s -; NONEON-NOSVE-NEXT: fadd v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fadd v2.4s, v2.4s, v3.4s -; NONEON-NOSVE-NEXT: fcvtn v1.4h, v4.4s -; NONEON-NOSVE-NEXT: fcvtn v3.4h, v5.4s -; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v2.4s -; NONEON-NOSVE-NEXT: stp q1, q3, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #94] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; 
NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #92] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #90] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #86] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #84] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #82] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #80] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #78] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 
+; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #76] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #74] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #70] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #68] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #66] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -454,10 +1103,20 @@ define void @fadd_v2f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fadd_v2f32: ; NONEON-NOSVE: // %bb.0: 
-; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: ldr d1, [x1] -; NONEON-NOSVE-NEXT: fadd v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [x1] +; NONEON-NOSVE-NEXT: ldr d1, [x0] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fadd s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] ; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x float>, ptr %a %op2 = load <2 x float>, ptr %b @@ -478,10 +1137,25 @@ define void @fadd_v4f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fadd_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fadd s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fadd s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x float>, ptr %a %op2 = load <4 x float>, ptr %b @@ -504,11 +1178,39 @@ define void @fadd_v8f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fadd_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fadd v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fadd v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #60] +; NONEON-NOSVE-NEXT: fadd s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #56] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #32] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #88] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #52] +; NONEON-NOSVE-NEXT: fadd s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #48] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fadd s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; 
NONEON-NOSVE-NEXT: stp s0, s3, [sp, #72] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fadd s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -529,10 +1231,19 @@ define void @fadd_v2f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fadd_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fadd v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fadd d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fadd d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x double>, ptr %a %op2 = load <2 x double>, ptr %b @@ -555,11 +1266,27 @@ define void @fadd_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fadd_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fadd v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: fadd v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] +; NONEON-NOSVE-NEXT: fadd d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, 
#48] +; NONEON-NOSVE-NEXT: fadd d0, d1, d0 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fadd d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fadd d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll index 03bb899c517b4e..8c23f5f9922da7 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll @@ -19,10 +19,70 @@ define void @test_revbv16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: test_revbv16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: rev16 v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rev16 v1.16b, v1.16b +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #47] +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] 
+; NONEON-NOSVE-NEXT: ldrb w8, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <32 x i8>, ptr %a %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> @@ -43,10 +103,70 @@ define void @test_revbv8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: test_revbv8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: rev32 v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rev32 v1.16b, v1.16b +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; 
NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #47] +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <32 x i8>, ptr %a %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> @@ -67,10 +187,70 @@ define void @test_revbv4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: test_revbv4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: rev64 v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rev64 v1.16b, v1.16b +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #47] +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] 
+; NONEON-NOSVE-NEXT: ldrb w8, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <32 x i8>, ptr %a %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> @@ -91,10 +271,34 @@ define void @test_revhv8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: test_revhv8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: rev32 v0.8h, v0.8h -; NONEON-NOSVE-NEXT: rev32 v1.8h, v1.8h +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #40] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #36] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add 
sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <16 x i16>, ptr %a %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> @@ -115,10 +319,34 @@ define void @test_revhv8f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: test_revhv8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: rev32 v0.8h, v0.8h -; NONEON-NOSVE-NEXT: rev32 v1.8h, v1.8h +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #40] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #36] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <16 x half>, ptr %a %tmp2 = shufflevector <16 x half> %tmp1, <16 x half> undef, <16 x i32> @@ -139,10 +367,34 @@ define void @test_revhv4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: test_revhv4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: rev64 v0.8h, v0.8h -; NONEON-NOSVE-NEXT: rev64 v1.8h, v1.8h +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #44] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #36] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <16 x i16>, ptr %a %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> @@ -163,10 +415,22 @@ define void @test_revwv4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: test_revwv4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: rev64 v0.4s, v0.4s -; NONEON-NOSVE-NEXT: rev64 v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i32>, ptr %a %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> @@ -187,10 +451,22 @@ define void @test_revwv4f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: test_revwv4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: rev64 v0.4s, v0.4s -; NONEON-NOSVE-NEXT: rev64 v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #40] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #32] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x float>, ptr %a %tmp2 = shufflevector <8 x float> %tmp1, <8 x float> undef, <8 x i32> @@ -210,7 +486,42 @@ define <16 x i8> @test_revv16i8(ptr %a) { ; NONEON-NOSVE-LABEL: test_revv16i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: rev64 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <16 x i8>, ptr %a %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> @@ -230,10 +541,22 @@ define void @test_revwv8i32v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: test_revwv8i32v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] -; NONEON-NOSVE-NEXT: rev64 v0.4s, v0.4s -; NONEON-NOSVE-NEXT: rev64 v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i32>, ptr %a %tmp2 = load <8 x i32>, ptr %b @@ -258,14 +581,58 @@ define void @test_revhv32i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: test_revhv32i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: rev64 v0.8h, v0.8h -; NONEON-NOSVE-NEXT: rev64 v1.8h, v1.8h -; NONEON-NOSVE-NEXT: rev64 v2.8h, v2.8h -; NONEON-NOSVE-NEXT: rev64 v3.8h, v3.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #128 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0, #32] +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #40] +; NONEON-NOSVE-NEXT: str q1, [sp, #96] +; NONEON-NOSVE-NEXT: str q3, [sp, #64] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #44] +; NONEON-NOSVE-NEXT: str q2, [sp] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #36] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #104] +; NONEON-NOSVE-NEXT: ldr q2, [sp, #48] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #108] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, 
w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #96] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #100] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldr q3, [sp, #112] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #70] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldur w8, [sp, #74] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stur w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #80] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: stp q3, q2, [x0] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0, #32] -; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <32 x i16>, ptr %a %tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <32 x i32> @@ -285,10 +652,18 @@ define void @test_rev_elts_fail(ptr %a) { ; ; NONEON-NOSVE-LABEL: test_rev_elts_fail: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp] +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x i64>, ptr %a %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> undef, <4 x i32> @@ -358,12 +733,23 @@ define void @test_revv8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: test_revv8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: rev64 v0.4s, v0.4s -; NONEON-NOSVE-NEXT: rev64 v1.4s, v1.4s -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #32] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #40] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i32>, ptr %a %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll index f254a1f9098f2d..bc6fdd1ecd5a71 100644 --- 
a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll @@ -72,14 +72,82 @@ define void @zip1_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: zip1_v32i8: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] ; NONEON-NOSVE-NEXT: ldr q0, [x0] ; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] ; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: zip2 v2.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: str q2, [x0, #16] -; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: strb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: strb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: strb w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: strb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: strb w8, [sp, #6] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: strb w8, [sp, #4] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #7] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #5] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #3] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #32] 
+; NONEON-NOSVE-NEXT: strb w8, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldr q1, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #47] +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: str q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <32 x i8>, ptr %a %tmp2 = load volatile <32 x i8>, ptr %b @@ -212,24 +280,149 @@ define void @zip_v32i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: zip_v32i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q4, q0, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q5, q1, [x0] -; NONEON-NOSVE-NEXT: ldp q6, q2, [x1, #32] -; NONEON-NOSVE-NEXT: ldp q7, q3, [x1] -; NONEON-NOSVE-NEXT: zip1 v17.8h, v0.8h, v2.8h -; 
NONEON-NOSVE-NEXT: zip2 v0.8h, v0.8h, v2.8h -; NONEON-NOSVE-NEXT: zip1 v16.8h, v1.8h, v3.8h -; NONEON-NOSVE-NEXT: zip2 v1.8h, v1.8h, v3.8h -; NONEON-NOSVE-NEXT: zip1 v2.8h, v5.8h, v7.8h -; NONEON-NOSVE-NEXT: zip1 v3.8h, v4.8h, v6.8h -; NONEON-NOSVE-NEXT: zip2 v5.8h, v5.8h, v7.8h -; NONEON-NOSVE-NEXT: zip2 v4.8h, v4.8h, v6.8h -; NONEON-NOSVE-NEXT: add v6.8h, v16.8h, v17.8h -; NONEON-NOSVE-NEXT: add v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: add v1.8h, v2.8h, v3.8h -; NONEON-NOSVE-NEXT: add v2.8h, v5.8h, v4.8h -; NONEON-NOSVE-NEXT: stp q6, q0, [x0, #32] -; NONEON-NOSVE-NEXT: stp q1, q2, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #192 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 192 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x1] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #32] +; NONEON-NOSVE-NEXT: stp q3, q5, [sp] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #30] +; NONEON-NOSVE-NEXT: stp q6, q2, [sp, #32] +; NONEON-NOSVE-NEXT: stp q7, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q4, q1, [sp, #96] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #126] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #190] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #78] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #28] +; NONEON-NOSVE-NEXT: strh w8, [sp, #188] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #124] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #186] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #76] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #184] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #122] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #182] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #74] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #24] +; 
NONEON-NOSVE-NEXT: strh w8, [sp, #180] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #120] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #178] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #72] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #22] +; NONEON-NOSVE-NEXT: strh w8, [sp, #176] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #118] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #174] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #70] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #20] +; NONEON-NOSVE-NEXT: strh w8, [sp, #172] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #116] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #170] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #68] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #18] +; NONEON-NOSVE-NEXT: strh w8, [sp, #168] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #114] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #166] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #66] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #164] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #112] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #162] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #64] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #110] +; NONEON-NOSVE-NEXT: strh w8, [sp, #160] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #160] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #62] +; NONEON-NOSVE-NEXT: strh w8, [sp, #158] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #46] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #108] +; NONEON-NOSVE-NEXT: strh 
w8, [sp, #156] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #92] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #60] +; NONEON-NOSVE-NEXT: strh w8, [sp, #154] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #44] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #106] +; NONEON-NOSVE-NEXT: strh w8, [sp, #152] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #90] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #58] +; NONEON-NOSVE-NEXT: strh w8, [sp, #150] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #42] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #104] +; NONEON-NOSVE-NEXT: strh w8, [sp, #148] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #88] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #56] +; NONEON-NOSVE-NEXT: strh w8, [sp, #146] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #40] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #102] +; NONEON-NOSVE-NEXT: strh w8, [sp, #144] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #86] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #54] +; NONEON-NOSVE-NEXT: strh w8, [sp, #142] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #38] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #100] +; NONEON-NOSVE-NEXT: strh w8, [sp, #140] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #84] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #52] +; NONEON-NOSVE-NEXT: strh w8, [sp, #138] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #36] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #98] +; NONEON-NOSVE-NEXT: strh w8, [sp, #136] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #82] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #50] +; NONEON-NOSVE-NEXT: strh w8, [sp, #134] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #34] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #96] +; NONEON-NOSVE-NEXT: strh w8, [sp, #132] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #80] +; 
NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #130] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #32] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #128] +; NONEON-NOSVE-NEXT: stp q3, q2, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #192 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <32 x i16>, ptr %a %tmp2 = load <32 x i16>, ptr %b @@ -282,14 +475,50 @@ define void @zip1_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: zip1_v16i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] ; NONEON-NOSVE-NEXT: ldr q0, [x0] ; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] ; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: zip2 v2.8h, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: zip1 v0.8h, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: str q2, [x0, #16] -; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: strh w8, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldr q1, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; 
NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #46] +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: str q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <16 x i16>, ptr %a %tmp2 = load volatile <16 x i16>, ptr %b @@ -326,14 +555,26 @@ define void @zip1_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: zip1_v8i32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] ; NONEON-NOSVE-NEXT: ldr q0, [x0] ; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] ; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: zip2 v2.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: zip1 v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: str q2, [x0, #16] -; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w8, w11, [sp, #32] +; NONEON-NOSVE-NEXT: stp w9, w11, [sp, #8] +; NONEON-NOSVE-NEXT: stp w10, w8, [sp] +; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldp w8, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr q1, [sp] +; NONEON-NOSVE-NEXT: stp w9, w11, [sp, #56] +; NONEON-NOSVE-NEXT: stp w10, w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: str q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <8 x i32>, ptr %a %tmp2 = load volatile <8 x i32>, ptr %b @@ -360,15 +601,28 @@ define void @zip_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: zip_v4f64: ; NONEON-NOSVE: // 
%bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] -; NONEON-NOSVE-NEXT: zip1 v4.2d, v1.2d, v3.2d -; NONEON-NOSVE-NEXT: zip1 v5.2d, v0.2d, v2.2d -; NONEON-NOSVE-NEXT: zip2 v1.2d, v1.2d, v3.2d -; NONEON-NOSVE-NEXT: zip2 v0.2d, v0.2d, v2.2d -; NONEON-NOSVE-NEXT: fadd v2.2d, v4.2d, v5.2d -; NONEON-NOSVE-NEXT: fadd v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: stp q2, q0, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] +; NONEON-NOSVE-NEXT: fadd d2, d1, d0 +; NONEON-NOSVE-NEXT: ldp d3, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: fadd d0, d3, d0 +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #48] +; NONEON-NOSVE-NEXT: fadd d2, d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d1, [sp] +; NONEON-NOSVE-NEXT: fadd d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x double>, ptr %a %tmp2 = load <4 x double>, ptr %b @@ -405,12 +659,29 @@ define void @zip_v4i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: zip_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: zip1 v2.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: zip2 v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: add v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #4] +; NONEON-NOSVE-NEXT: str w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldr w9, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x i32>, ptr %a %tmp2 = load <4 x i32>, ptr %b @@ -436,12 +707,22 @@ define void @zip1_v8i32_undef(ptr %a) { ; ; NONEON-NOSVE-LABEL: zip1_v8i32_undef: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #48 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 ; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: zip2 v1.4s, v0.4s, v0.4s -; NONEON-NOSVE-NEXT: zip1 v0.4s, v0.4s, v0.4s -; NONEON-NOSVE-NEXT: str q1, [x0, #16] -; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w8, [sp] +; NONEON-NOSVE-NEXT: ldp w8, w10, [sp, #24] +; NONEON-NOSVE-NEXT: stp w9, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr q1, [sp] +; NONEON-NOSVE-NEXT: stp w10, w10, [sp, #40] +; NONEON-NOSVE-NEXT: stp w8, w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: str q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <8 x i32>, ptr %a %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> @@ -465,15 +746,131 @@ define void @trn_v32i8(ptr %a, ptr 
%b) { ; ; NONEON-NOSVE-LABEL: trn_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] -; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] -; NONEON-NOSVE-NEXT: trn1 v4.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: trn2 v0.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: trn1 v1.16b, v2.16b, v3.16b -; NONEON-NOSVE-NEXT: trn2 v2.16b, v2.16b, v3.16b -; NONEON-NOSVE-NEXT: add v0.16b, v4.16b, v0.16b -; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #62] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #47] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #60] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #45] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #58] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #43] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #56] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #54] +; NONEON-NOSVE-NEXT: strb w8, 
[sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #39] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #52] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #37] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #50] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #35] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #33] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #30] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #28] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #26] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: add 
w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #24] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #20] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #18] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <32 x i8>, ptr %a %tmp2 = load <32 x i8>, ptr %b @@ -500,15 +897,32 @@ define void @trn_v8i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: trn_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: adrp x8, .LCPI8_0 -; NONEON-NOSVE-NEXT: adrp x9, .LCPI8_1 -; NONEON-NOSVE-NEXT: ldr q1, [x0] -; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI8_0] -; NONEON-NOSVE-NEXT: ldr q2, [x9, :lo12:.LCPI8_1] -; NONEON-NOSVE-NEXT: tbl v0.16b, { v1.16b }, v0.16b -; NONEON-NOSVE-NEXT: tbl v1.16b, { v1.16b }, v2.16b -; NONEON-NOSVE-NEXT: 
add v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #4] +; NONEON-NOSVE-NEXT: add w10, w9, w8 +; NONEON-NOSVE-NEXT: strh w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #8] +; NONEON-NOSVE-NEXT: add w10, w11, w10 +; NONEON-NOSVE-NEXT: strh w10, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #12] +; NONEON-NOSVE-NEXT: add w11, w10, w11 +; NONEON-NOSVE-NEXT: add w8, w8, w10 +; NONEON-NOSVE-NEXT: strh w11, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: add w11, w12, w11 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: strh w11, [sp, #20] +; NONEON-NOSVE-NEXT: strh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i16>, ptr %a %tmp2 = load <8 x i16>, ptr %b @@ -535,15 +949,79 @@ define void @trn_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: trn_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] -; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] -; NONEON-NOSVE-NEXT: trn1 v4.8h, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: trn2 v0.8h, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: trn1 v1.8h, v2.8h, v3.8h -; NONEON-NOSVE-NEXT: trn2 v2.8h, v2.8h, v3.8h -; NONEON-NOSVE-NEXT: add v0.8h, v4.8h, v0.8h -; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v2.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #60] +; NONEON-NOSVE-NEXT: add w8, w9, w8 
+; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #46] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #56] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #42] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #52] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #38] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #34] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #28] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #20] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] 
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <16 x i16>, ptr %a %tmp2 = load <16 x i16>, ptr %b @@ -570,15 +1048,25 @@ define void @trn_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: trn_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] -; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] -; NONEON-NOSVE-NEXT: zip1 v4.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: trn2 v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: trn1 v1.4s, v2.4s, v3.4s -; NONEON-NOSVE-NEXT: trn2 v2.4s, v2.4s, v3.4s -; NONEON-NOSVE-NEXT: add v0.4s, v4.4s, v0.4s -; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v2.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: ldr q0, [x1, #16] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #48] +; NONEON-NOSVE-NEXT: str q2, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #32] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #64] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i32>, ptr %a %tmp2 = load <8 x i32>, ptr %b @@ -606,15 +1094,25 @@ define 
void @trn_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: trn_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] -; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] -; NONEON-NOSVE-NEXT: zip1 v4.2d, v0.2d, v1.2d -; NONEON-NOSVE-NEXT: zip2 v0.2d, v0.2d, v1.2d -; NONEON-NOSVE-NEXT: zip1 v1.2d, v2.2d, v3.2d -; NONEON-NOSVE-NEXT: zip2 v2.2d, v2.2d, v3.2d -; NONEON-NOSVE-NEXT: fadd v0.2d, v4.2d, v0.2d -; NONEON-NOSVE-NEXT: fadd v1.2d, v1.2d, v2.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #48] +; NONEON-NOSVE-NEXT: fadd d2, d1, d0 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #32] +; NONEON-NOSVE-NEXT: fadd d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #80] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: fadd d2, d1, d0 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: fadd d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x double>, ptr %a %tmp2 = load <4 x double>, ptr %b @@ -639,12 +1137,23 @@ define void @trn_v4f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: trn_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: trn1 v2.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: trn2 v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fadd v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #24] +; NONEON-NOSVE-NEXT: fadd s2, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #8] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #40] +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #16] +; NONEON-NOSVE-NEXT: fadd s2, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x float>, ptr %a %tmp2 = load <4 x float>, ptr %b @@ -670,14 +1179,24 @@ define void @trn_v8i32_undef(ptr %a) { ; ; NONEON-NOSVE-LABEL: trn_v8i32_undef: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: trn1 v2.4s, v0.4s, v0.4s -; NONEON-NOSVE-NEXT: trn2 v0.4s, v0.4s, v0.4s -; NONEON-NOSVE-NEXT: trn1 v3.4s, v1.4s, v1.4s -; NONEON-NOSVE-NEXT: trn2 v1.4s, v1.4s, v1.4s -; NONEON-NOSVE-NEXT: add v0.4s, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: add v1.4s, v3.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i32>, ptr %a %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> @@ -753,14 +1272,82 @@ define void @zip2_v32i8(ptr %a, ptr %b) #0{ ; ; NONEON-NOSVE-LABEL: zip2_v32i8: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: ldr q0, [x0] ; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] ; NONEON-NOSVE-NEXT: ldr q1, [x1] ; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] -; NONEON-NOSVE-NEXT: zip2 v2.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: str q2, [x0, #16] -; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: strb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: strb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: strb w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: strb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: strb w8, [sp, #6] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: strb w8, [sp, #4] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, 
[sp] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #7] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #5] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #3] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldr q1, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #47] +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, 
#49] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: str q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <32 x i8>, ptr %a %tmp2 = load volatile <32 x i8>, ptr %b @@ -811,14 +1398,50 @@ define void @zip2_v16i16(ptr %a, ptr %b) #0{ ; ; NONEON-NOSVE-LABEL: zip2_v16i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: ldr q0, [x0] ; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] ; NONEON-NOSVE-NEXT: ldr q1, [x1] ; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] -; NONEON-NOSVE-NEXT: zip2 v2.8h, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: zip1 v0.8h, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: str q2, [x0, #16] -; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: strh w8, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldr q1, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #46] +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, 
[sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: str q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <16 x i16>, ptr %a %tmp2 = load volatile <16 x i16>, ptr %b @@ -855,14 +1478,26 @@ define void @zip2_v8i32(ptr %a, ptr %b) #0{ ; ; NONEON-NOSVE-LABEL: zip2_v8i32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: ldr q0, [x0] ; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] ; NONEON-NOSVE-NEXT: ldr q1, [x1] ; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] -; NONEON-NOSVE-NEXT: zip2 v2.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: zip1 v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: str q2, [x0, #16] -; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w8, w11, [sp, #32] +; NONEON-NOSVE-NEXT: stp w9, w11, [sp, #8] +; NONEON-NOSVE-NEXT: stp w10, w8, [sp] +; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldp w8, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr q1, [sp] +; NONEON-NOSVE-NEXT: stp w9, w11, [sp, #56] +; NONEON-NOSVE-NEXT: stp w10, w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: str q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <8 x i32>, ptr %a %tmp2 = load volatile <8 x i32>, ptr %b @@ -886,12 +1521,22 @@ define void @zip2_v8i32_undef(ptr %a) #0{ ; ; NONEON-NOSVE-LABEL: zip2_v8i32_undef: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #48 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 ; NONEON-NOSVE-NEXT: ldr q0, [x0] ; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: zip2 v1.4s, v0.4s, v0.4s -; 
NONEON-NOSVE-NEXT: zip1 v0.4s, v0.4s, v0.4s -; NONEON-NOSVE-NEXT: str q1, [x0, #16] -; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w8, [sp] +; NONEON-NOSVE-NEXT: ldp w8, w10, [sp, #24] +; NONEON-NOSVE-NEXT: stp w9, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr q1, [sp] +; NONEON-NOSVE-NEXT: stp w10, w10, [sp, #40] +; NONEON-NOSVE-NEXT: stp w8, w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: str q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <8 x i32>, ptr %a %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> @@ -1097,15 +1742,131 @@ define void @uzp_v32i8(ptr %a, ptr %b) #0{ ; ; NONEON-NOSVE-LABEL: uzp_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] -; NONEON-NOSVE-NEXT: uzp1 v4.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: uzp2 v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: uzp1 v1.16b, v3.16b, v2.16b -; NONEON-NOSVE-NEXT: uzp2 v2.16b, v3.16b, v2.16b -; NONEON-NOSVE-NEXT: add v0.16b, v4.16b, v0.16b -; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #62] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #60] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #58] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #56] +; NONEON-NOSVE-NEXT: strb w8, 
[sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #54] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #52] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #50] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #30] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #28] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #26] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #24] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #22] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #20] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #18] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: add 
w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #47] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #45] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #43] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #39] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #37] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #35] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, 
#65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <32 x i8>, ptr %a %tmp2 = load <32 x i8>, ptr %b @@ -1133,12 +1894,21 @@ define void @uzp_v4i16(ptr %a, ptr %b) #0{ ; NONEON-NOSVE-LABEL: uzp_v4i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: ext v1.8b, v0.8b, v0.8b, #6 -; NONEON-NOSVE-NEXT: ext v2.8b, v0.8b, v0.8b, #2 -; NONEON-NOSVE-NEXT: trn1 v1.4h, v0.4h, v1.4h -; NONEON-NOSVE-NEXT: zip1 v0.4h, v2.4h, v0.4h -; NONEON-NOSVE-NEXT: add v0.4h, v1.4h, v0.4h +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: add w9, w9, w8 +; NONEON-NOSVE-NEXT: strh w9, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] ; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x i16>, ptr %a %tmp2 = load <4 x i16>, ptr %b @@ -1260,15 +2030,79 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{ ; ; NONEON-NOSVE-LABEL: uzp_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] -; NONEON-NOSVE-NEXT: uzp1 v4.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: uzp2 v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v3.8h, v2.8h -; NONEON-NOSVE-NEXT: uzp2 v2.8h, v3.8h, v2.8h -; NONEON-NOSVE-NEXT: add v0.8h, v4.8h, v0.8h -; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v2.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: 
ldp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #60] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #56] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #52] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #28] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #20] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #46] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #42] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #38] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #34] +; 
NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <16 x i16>, ptr %a %tmp2 = load <16 x i16>, ptr %b @@ -1312,15 +2146,31 @@ define void @uzp_v8f32(ptr %a, ptr %b) #0{ ; ; NONEON-NOSVE-LABEL: uzp_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] -; NONEON-NOSVE-NEXT: uzp1 v4.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: uzp2 v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: uzp1 v1.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: uzp2 v2.4s, v3.4s, v0.4s -; NONEON-NOSVE-NEXT: fadd v0.4s, v4.4s, v0.4s -; NONEON-NOSVE-NEXT: fadd v1.4s, v1.4s, v2.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: mov x8, #9205357640488583168 // =0x7fc000007fc00000 +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: str x8, [sp, #56] +; NONEON-NOSVE-NEXT: mov w8, #2143289344 // =0x7fc00000 +; NONEON-NOSVE-NEXT: str w8, [sp, #48] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #24] +; NONEON-NOSVE-NEXT: str q2, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #68] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: str 
s0, [sp, #52] +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #40] +; NONEON-NOSVE-NEXT: fadd s2, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #32] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #72] +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: str s0, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #48] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x float>, ptr %a %tmp2 = load <8 x float>, ptr %b @@ -1347,15 +2197,27 @@ define void @uzp_v4i64(ptr %a, ptr %b) #0{ ; ; NONEON-NOSVE-LABEL: uzp_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] -; NONEON-NOSVE-NEXT: zip1 v4.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: zip2 v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: zip1 v1.2d, v3.2d, v2.2d -; NONEON-NOSVE-NEXT: zip2 v2.2d, v3.2d, v2.2d -; NONEON-NOSVE-NEXT: add v0.2d, v4.2d, v0.2d -; NONEON-NOSVE-NEXT: add v1.2d, v1.2d, v2.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp, #48] +; NONEON-NOSVE-NEXT: add x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #88] +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp, #16] +; NONEON-NOSVE-NEXT: add x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #80] +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp, #32] +; NONEON-NOSVE-NEXT: add x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #72] +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp] +; NONEON-NOSVE-NEXT: add x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x i64>, ptr %a %tmp2 = load <4 x i64>, ptr %b @@ -1427,12 +2289,45 @@ define 
void @uzp_v8i16(ptr %a, ptr %b) #0{ ; ; NONEON-NOSVE-LABEL: uzp_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: uzp1 v2.8h, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: uzp2 v0.8h, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: add v0.8h, v2.8h, v0.8h +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #28] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #20] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i16>, ptr %a %tmp2 = load <8 x i16>, ptr %b @@ -1476,10 +2371,23 @@ define void @uzp_v8i32_undef(ptr %a) 
#0{ ; NONEON-NOSVE-LABEL: uzp_v8i32_undef: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: uzp1 v2.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: uzp2 v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: add v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i32>, ptr %a %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> @@ -1507,15 +2415,28 @@ define void @zip_vscale2_4(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: zip_vscale2_4: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] -; NONEON-NOSVE-NEXT: zip1 v4.2d, v1.2d, v3.2d -; NONEON-NOSVE-NEXT: zip1 v5.2d, v0.2d, v2.2d -; NONEON-NOSVE-NEXT: zip2 v1.2d, v1.2d, v3.2d -; NONEON-NOSVE-NEXT: zip2 v0.2d, v0.2d, v2.2d -; NONEON-NOSVE-NEXT: fadd v2.2d, v4.2d, v5.2d -; NONEON-NOSVE-NEXT: fadd v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: stp q2, q0, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] +; NONEON-NOSVE-NEXT: fadd d2, d1, d0 +; NONEON-NOSVE-NEXT: ldp d3, d1, [sp, #8] +; 
NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: fadd d0, d3, d0 +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #48] +; NONEON-NOSVE-NEXT: fadd d2, d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d1, [sp] +; NONEON-NOSVE-NEXT: fadd d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x double>, ptr %a %tmp2 = load <4 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll index 41d2cb8a2c7564..8ebf713a671f49 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll @@ -39,19 +39,76 @@ define i1 @ptest_v16i1(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: ptest_v16i1: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: fcmeq v0.4s, v0.4s, #0.0 -; NONEON-NOSVE-NEXT: fcmeq v1.4s, v1.4s, #0.0 -; NONEON-NOSVE-NEXT: fcmeq v3.4s, v3.4s, #0.0 -; NONEON-NOSVE-NEXT: fcmeq v2.4s, v2.4s, #0.0 -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v3.8h -; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: mvn v0.16b, v0.16b -; NONEON-NOSVE-NEXT: umaxv b0, v0.16b -; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: mov w8, #255 // =0xff +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] +; NONEON-NOSVE-NEXT: stp q1, q2, [sp] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #32] +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #40] +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: csel w9, w8, wzr, ne +; NONEON-NOSVE-NEXT: 
fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp] +; NONEON-NOSVE-NEXT: csetm w10, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: csetm w11, ne +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: csinv w11, w11, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: csinv w11, w11, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: csinv w11, w11, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: csinv w11, w11, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: csinv w11, w11, wzr, eq +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w10, w11, w10, hi +; NONEON-NOSVE-NEXT: and w10, w10, #0xff +; NONEON-NOSVE-NEXT: cmp w10, w9 +; NONEON-NOSVE-NEXT: csel w9, w10, w9, hi +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: csel w10, w8, wzr, ne +; NONEON-NOSVE-NEXT: cmp w9, w10 +; NONEON-NOSVE-NEXT: csel w9, w9, w10, hi +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: csel w10, w8, wzr, ne +; NONEON-NOSVE-NEXT: cmp w9, w10 +; NONEON-NOSVE-NEXT: csel w9, w9, w10, hi +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: csel w10, w8, wzr, ne +; NONEON-NOSVE-NEXT: cmp w9, w10 +; NONEON-NOSVE-NEXT: csel w9, w9, w10, hi +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: csel w10, w8, wzr, ne +; NONEON-NOSVE-NEXT: cmp w9, w10 +; NONEON-NOSVE-NEXT: csel w9, w9, w10, hi +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: csel w10, w8, wzr, ne +; NONEON-NOSVE-NEXT: cmp w9, w10 +; NONEON-NOSVE-NEXT: csel w9, w9, w10, hi +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: csel w10, w8, wzr, ne +; NONEON-NOSVE-NEXT: cmp w9, w10 +; NONEON-NOSVE-NEXT: csel w9, w9, w10, hi +; NONEON-NOSVE-NEXT: 
fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: csel w10, w8, wzr, ne +; NONEON-NOSVE-NEXT: cmp w9, w10 +; NONEON-NOSVE-NEXT: csel w9, w9, w10, hi +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: csel w8, w8, wzr, ne +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi ; NONEON-NOSVE-NEXT: and w0, w8, #0x1 +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %v0 = bitcast ptr %a to ptr %v1 = load <16 x float>, ptr %v0, align 4 @@ -113,29 +170,144 @@ define i1 @ptest_or_v16i1(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: ptest_or_v16i1: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: ldp q5, q4, [x1, #32] -; NONEON-NOSVE-NEXT: fcmeq v1.4s, v1.4s, #0.0 -; NONEON-NOSVE-NEXT: fcmeq v0.4s, v0.4s, #0.0 -; NONEON-NOSVE-NEXT: fcmeq v3.4s, v3.4s, #0.0 -; NONEON-NOSVE-NEXT: fcmeq v2.4s, v2.4s, #0.0 -; NONEON-NOSVE-NEXT: ldp q6, q7, [x1] -; NONEON-NOSVE-NEXT: fcmeq v4.4s, v4.4s, #0.0 -; NONEON-NOSVE-NEXT: fcmeq v5.4s, v5.4s, #0.0 -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: fcmeq v7.4s, v7.4s, #0.0 -; NONEON-NOSVE-NEXT: fcmeq v6.4s, v6.4s, #0.0 -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v3.8h -; NONEON-NOSVE-NEXT: uzp1 v2.8h, v5.8h, v4.8h -; NONEON-NOSVE-NEXT: uzp1 v3.8h, v6.8h, v7.8h -; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: uzp1 v1.16b, v3.16b, v2.16b -; NONEON-NOSVE-NEXT: mvn v0.16b, v0.16b -; NONEON-NOSVE-NEXT: orn v0.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: umaxv b0, v0.16b -; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #128 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #48] +; NONEON-NOSVE-NEXT: str q2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr s1, [sp, #52] +; NONEON-NOSVE-NEXT: ldr q0, [x1, #16] +; NONEON-NOSVE-NEXT: str q0, [sp, #96] +; 
NONEON-NOSVE-NEXT: ldp s2, s0, [sp, #96] +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #48] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: str q1, [sp, #16] +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: ldr s2, [sp, #12] +; NONEON-NOSVE-NEXT: csetm w9, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #24] +; NONEON-NOSVE-NEXT: csinv w9, w9, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: csetm w10, ne +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: csinv w10, w10, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: csetm w11, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #16] +; NONEON-NOSVE-NEXT: csinv w11, w11, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldp s2, s0, [sp] +; NONEON-NOSVE-NEXT: orr w10, w11, w10 +; NONEON-NOSVE-NEXT: csetm w12, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: csinv w12, w12, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #104] +; NONEON-NOSVE-NEXT: csetm w13, ne +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: csinv w14, w13, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #56] +; NONEON-NOSVE-NEXT: orr w12, w14, w12 +; NONEON-NOSVE-NEXT: orr w10, w12, w10 +; NONEON-NOSVE-NEXT: csetm w13, ne +; NONEON-NOSVE-NEXT: orr w9, w10, w9 +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldr q0, [x1, #32] +; NONEON-NOSVE-NEXT: str q0, [sp, #112] +; NONEON-NOSVE-NEXT: csinv w13, w13, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldr s1, [sp, #64] +; NONEON-NOSVE-NEXT: csetm w15, ne +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #112] +; NONEON-NOSVE-NEXT: csinv w15, w15, wzr, 
eq +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: and w11, w15, #0xff +; NONEON-NOSVE-NEXT: csetm w16, ne +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #68] +; NONEON-NOSVE-NEXT: csinv w16, w16, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: csetm w17, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #120] +; NONEON-NOSVE-NEXT: csinv w17, w17, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #76] +; NONEON-NOSVE-NEXT: csetm w18, ne +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldr q1, [x1, #48] +; NONEON-NOSVE-NEXT: str q1, [sp, #80] +; NONEON-NOSVE-NEXT: csinv w18, w18, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: ldr s2, [sp, #32] +; NONEON-NOSVE-NEXT: csetm w0, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #80] +; NONEON-NOSVE-NEXT: csinv w0, w0, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: csetm w1, ne +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: csinv w1, w1, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #36] +; NONEON-NOSVE-NEXT: csetm w2, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #88] +; NONEON-NOSVE-NEXT: csinv w2, w2, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #44] +; NONEON-NOSVE-NEXT: csetm w3, ne +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: csinv w3, w3, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: csetm w4, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: csinv w10, w4, wzr, eq +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: and w9, w13, #0xff +; NONEON-NOSVE-NEXT: and w10, w10, #0xff +; NONEON-NOSVE-NEXT: and w8, w8, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: and w9, w16, #0xff 
+; NONEON-NOSVE-NEXT: cmp w8, w11 +; NONEON-NOSVE-NEXT: csel w8, w8, w11, hi +; NONEON-NOSVE-NEXT: and w11, w17, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: and w9, w18, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w11 +; NONEON-NOSVE-NEXT: csel w8, w8, w11, hi +; NONEON-NOSVE-NEXT: and w11, w0, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: and w9, w1, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w11 +; NONEON-NOSVE-NEXT: csel w8, w8, w11, hi +; NONEON-NOSVE-NEXT: and w11, w2, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: and w9, w3, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w11 +; NONEON-NOSVE-NEXT: csel w8, w8, w11, hi +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi ; NONEON-NOSVE-NEXT: and w0, w8, #0x1 +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %v0 = bitcast ptr %a to ptr %v1 = load <16 x float>, ptr %v0, align 4 @@ -207,29 +379,144 @@ define i1 @ptest_and_v16i1(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: ptest_and_v16i1: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: ldp q5, q4, [x1, #32] -; NONEON-NOSVE-NEXT: fcmeq v1.4s, v1.4s, #0.0 -; NONEON-NOSVE-NEXT: fcmeq v0.4s, v0.4s, #0.0 -; NONEON-NOSVE-NEXT: fcmeq v3.4s, v3.4s, #0.0 -; NONEON-NOSVE-NEXT: fcmeq v2.4s, v2.4s, #0.0 -; NONEON-NOSVE-NEXT: ldp q6, q7, [x1] -; NONEON-NOSVE-NEXT: fcmeq v4.4s, v4.4s, #0.0 -; NONEON-NOSVE-NEXT: fcmeq v5.4s, v5.4s, #0.0 -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: fcmeq v7.4s, v7.4s, #0.0 -; NONEON-NOSVE-NEXT: fcmeq v6.4s, v6.4s, #0.0 -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v3.8h -; NONEON-NOSVE-NEXT: uzp1 v2.8h, v5.8h, v4.8h -; NONEON-NOSVE-NEXT: uzp1 v3.8h, v6.8h, v7.8h -; NONEON-NOSVE-NEXT: uzp1 
v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: uzp1 v1.16b, v3.16b, v2.16b -; NONEON-NOSVE-NEXT: mvn v0.16b, v0.16b -; NONEON-NOSVE-NEXT: bic v0.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: uminv b0, v0.16b -; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #128 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #48] +; NONEON-NOSVE-NEXT: str q2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr s1, [sp, #52] +; NONEON-NOSVE-NEXT: ldr q0, [x1, #16] +; NONEON-NOSVE-NEXT: str q0, [sp, #96] +; NONEON-NOSVE-NEXT: ldp s2, s0, [sp, #96] +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #48] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: str q1, [sp, #16] +; NONEON-NOSVE-NEXT: csel w8, w8, wzr, ne +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: ldr s2, [sp, #12] +; NONEON-NOSVE-NEXT: csetm w9, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #24] +; NONEON-NOSVE-NEXT: csel w9, w9, wzr, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: csetm w10, ne +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: csel w10, w10, wzr, ne +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: csetm w11, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #16] +; NONEON-NOSVE-NEXT: csel w11, w11, wzr, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldp s2, s0, [sp] +; NONEON-NOSVE-NEXT: and w10, w11, w10 +; NONEON-NOSVE-NEXT: csetm w12, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: csel w12, w12, wzr, ne +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #104] +; NONEON-NOSVE-NEXT: csetm w13, ne +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: csel w14, w13, 
wzr, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #56] +; NONEON-NOSVE-NEXT: and w12, w14, w12 +; NONEON-NOSVE-NEXT: and w10, w12, w10 +; NONEON-NOSVE-NEXT: csetm w13, ne +; NONEON-NOSVE-NEXT: and w9, w10, w9 +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldr q0, [x1, #32] +; NONEON-NOSVE-NEXT: str q0, [sp, #112] +; NONEON-NOSVE-NEXT: csel w13, w13, wzr, ne +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldr s1, [sp, #64] +; NONEON-NOSVE-NEXT: csetm w15, ne +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #112] +; NONEON-NOSVE-NEXT: csel w15, w15, wzr, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: and w11, w15, #0xff +; NONEON-NOSVE-NEXT: csetm w16, ne +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #68] +; NONEON-NOSVE-NEXT: csel w16, w16, wzr, ne +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: csetm w17, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #120] +; NONEON-NOSVE-NEXT: csel w17, w17, wzr, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #76] +; NONEON-NOSVE-NEXT: csetm w18, ne +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldr q1, [x1, #48] +; NONEON-NOSVE-NEXT: str q1, [sp, #80] +; NONEON-NOSVE-NEXT: csel w18, w18, wzr, ne +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: ldr s2, [sp, #32] +; NONEON-NOSVE-NEXT: csetm w0, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #80] +; NONEON-NOSVE-NEXT: csel w0, w0, wzr, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: csetm w1, ne +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: csel w1, w1, wzr, ne +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #36] +; NONEON-NOSVE-NEXT: csetm w2, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #88] +; NONEON-NOSVE-NEXT: csel w2, w2, wzr, ne +; NONEON-NOSVE-NEXT: 
fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #44] +; NONEON-NOSVE-NEXT: csetm w3, ne +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: csel w3, w3, wzr, ne +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: csetm w4, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: csel w10, w4, wzr, ne +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: and w9, w13, #0xff +; NONEON-NOSVE-NEXT: and w10, w10, #0xff +; NONEON-NOSVE-NEXT: and w8, w8, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: and w9, w16, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w11 +; NONEON-NOSVE-NEXT: csel w8, w8, w11, lo +; NONEON-NOSVE-NEXT: and w11, w17, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: and w9, w18, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w11 +; NONEON-NOSVE-NEXT: csel w8, w8, w11, lo +; NONEON-NOSVE-NEXT: and w11, w0, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: and w9, w1, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w11 +; NONEON-NOSVE-NEXT: csel w8, w8, w11, lo +; NONEON-NOSVE-NEXT: and w11, w2, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: and w9, w3, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w11 +; NONEON-NOSVE-NEXT: csel w8, w8, w11, lo +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo ; NONEON-NOSVE-NEXT: and w0, w8, #0x1 +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %v0 = bitcast ptr %a to ptr %v1 = load <16 x float>, ptr %v0, align 4 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll index 5626f77c684f22..bc0fc7c79391d1 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll +++ 
b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll @@ -22,9 +22,26 @@ define <4 x i8> @bitreverse_v4i8(<4 x i8> %op) { ; ; NONEON-NOSVE-LABEL: bitreverse_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev16 v0.8b, v0.8b -; NONEON-NOSVE-NEXT: rbit v0.8b, v0.8b -; NONEON-NOSVE-NEXT: ushr v0.4h, v0.4h, #8 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i8> @llvm.bitreverse.v4i8(<4 x i8> %op) ret <4 x i8> %res @@ -41,7 +58,42 @@ define <8 x i8> @bitreverse_v8i8(<8 x i8> %op) { ; ; NONEON-NOSVE-LABEL: bitreverse_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rbit v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %op) ret <8 x i8> %res @@ -58,7 +110,74 @@ define <16 x i8> @bitreverse_v16i8(<16 x i8> %op) { ; ; NONEON-NOSVE-LABEL: bitreverse_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 
+; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %op) ret <16 x i8> %res @@ -76,10 +195,140 @@ define void @bitreverse_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: bitreverse_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rbit v1.16b, v1.16b +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, 
w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, 
w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %op) @@ -99,9 +348,17 @@ define <2 x i16> @bitreverse_v2i16(<2 x i16> %op) { ; ; NONEON-NOSVE-LABEL: bitreverse_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev32 v0.8b, v0.8b -; NONEON-NOSVE-NEXT: rbit v0.8b, v0.8b -; NONEON-NOSVE-NEXT: ushr v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %op) ret <2 x i16> %res @@ -118,8 +375,26 @@ define <4 x i16> @bitreverse_v4i16(<4 x i16> %op) { ; ; NONEON-NOSVE-LABEL: bitreverse_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev16 v0.8b, v0.8b -; NONEON-NOSVE-NEXT: rbit v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.bitreverse.v4i16(<4 x i16> %op) ret <4 x i16> %res @@ -136,8 +411,42 @@ define <8 x i16> @bitreverse_v8i16(<8 x i16> %op) { ; ; NONEON-NOSVE-LABEL: bitreverse_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev16 v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %op) ret <8 x i16> %res @@ -155,12 +464,76 @@ define void @bitreverse_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: bitreverse_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: rev16 v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rev16 v1.16b, v1.16b -; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rbit v1.16b, v1.16b +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, 
#16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %op) @@ -179,8 +552,15 @@ define <2 x i32> @bitreverse_v2i32(<2 x i32> %op) { ; ; NONEON-NOSVE-LABEL: bitreverse_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev32 v0.8b, v0.8b -; NONEON-NOSVE-NEXT: rbit v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: rbit w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %op) ret <2 x i32> %res @@ -197,8 +577,20 @@ define <4 x i32> @bitreverse_v4i32(<4 x i32> %op) { ; ; NONEON-NOSVE-LABEL: bitreverse_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev32 v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: rbit w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: rbit w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %op) ret <4 x i32> %res @@ -216,12 +608,32 @@ define void @bitreverse_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: bitreverse_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: rev32 v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rev32 v1.16b, v1.16b -; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rbit v1.16b, v1.16b +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: rbit w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: rbit w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: rbit w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: rbit w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %op) @@ -240,8 +652,13 @@ define <1 x i64> @bitreverse_v1i64(<1 x i64> %op) { ; ; NONEON-NOSVE-LABEL: bitreverse_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev64 v0.8b, v0.8b -; NONEON-NOSVE-NEXT: rbit v0.8b, v0.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: rbit x8, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.bitreverse.v1i64(<1 x i64> %op) ret <1 x i64> %res @@ -258,8 +675,15 @@ define <2 x i64> @bitreverse_v2i64(<2 x i64> %op) { ; ; NONEON-NOSVE-LABEL: bitreverse_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev64 v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: rbit x9, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: rbit x8, x8 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %op) ret <2 x i64> %res @@ -277,12 +701,22 @@ define void @bitreverse_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: bitreverse_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: rev64 v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rev64 v1.16b, v1.16b -; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rbit v1.16b, v1.16b +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: rbit x9, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: rbit x8, x8 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: rbit x9, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: rbit x8, x8 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %op) @@ -306,8 +740,31 @@ define <2 x i16> @bswap_v2i16(<2 x i16> %op) { ; ; NONEON-NOSVE-LABEL: bswap_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev32 v0.8b, v0.8b -; NONEON-NOSVE-NEXT: ushr v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: str d0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: str d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %op) ret <2 x i16> %res @@ -324,7 +781,26 @@ define <4 x i16> @bswap_v4i16(<4 x i16> %op) { ; ; NONEON-NOSVE-LABEL: bswap_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev16 v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %op) ret <4 x i16> %res @@ -341,7 +817,42 @@ define <8 x i16> @bswap_v8i16(<8 x i16> %op) { ; ; NONEON-NOSVE-LABEL: bswap_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev16 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %op) ret <8 x i16> %res @@ -359,10 +870,79 @@ define void @bswap_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: bswap_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: rev16 v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rev16 v1.16b, v1.16b +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #47] +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #40] +; 
NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %op) @@ -381,7 +961,26 @@ define <2 x i32> @bswap_v2i32(<2 x i32> %op) { ; ; NONEON-NOSVE-LABEL: bswap_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev32 v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %op) ret <2 x i32> %res @@ -398,7 +997,42 @@ define <4 x i32> @bswap_v4i32(<4 x i32> %op) { ; ; NONEON-NOSVE-LABEL: bswap_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev32 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %op) ret <4 x i32> %res @@ -416,10 +1050,79 @@ define void @bswap_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: bswap_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: rev32 v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rev32 v1.16b, v1.16b +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #47] +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #42] +; 
NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %op) @@ -438,7 +1141,26 @@ define <1 x i64> @bswap_v1i64(<1 x i64> %op) { ; ; NONEON-NOSVE-LABEL: bswap_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev64 v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.bswap.v1i64(<1 x i64> %op) ret <1 x i64> %res @@ -455,7 +1177,42 @@ define <2 x i64> @bswap_v2i64(<2 x i64> %op) { ; ; NONEON-NOSVE-LABEL: bswap_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev64 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %op) ret <2 x i64> %res @@ -473,10 +1230,79 @@ define void @bswap_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: bswap_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: rev64 v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rev64 v1.16b, v1.16b +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #46] +; 
NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #47] +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %op) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll index 55f4f5bae641e5..df019ce2e0ad67 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll @@ -18,15 +18,38 @@ define <4 x i8> @sdiv_v4i8(<4 x i8> %op1) { ; ; NONEON-NOSVE-LABEL: sdiv_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v1.4h, v0.4h, #8 -; NONEON-NOSVE-NEXT: movi d2, #0xff00ff00ff00ff -; NONEON-NOSVE-NEXT: sshr v1.4h, v1.4h, #8 -; NONEON-NOSVE-NEXT: ushr v1.4h, v1.4h, #7 -; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: usra v0.4h, v1.4h, #3 -; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #8 -; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #8 -; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #5 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w12, [sp] +; NONEON-NOSVE-NEXT: sxtb w11, w8 +; NONEON-NOSVE-NEXT: sxtb w13, w9 +; NONEON-NOSVE-NEXT: sxtb w14, w10 +; NONEON-NOSVE-NEXT: sxtb w15, w12 +; NONEON-NOSVE-NEXT: ubfx w11, w11, #10, #5 +; NONEON-NOSVE-NEXT: ubfx w13, w13, #10, #5 +; NONEON-NOSVE-NEXT: ubfx w14, w14, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w11 +; NONEON-NOSVE-NEXT: ubfx w11, w15, #10, #5 +; NONEON-NOSVE-NEXT: add w9, w9, w13 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: add w10, w10, w14 +; NONEON-NOSVE-NEXT: sxtb w9, w9 +; NONEON-NOSVE-NEXT: add w11, w12, w11 +; NONEON-NOSVE-NEXT: sxtb w10, w10 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: sxtb w11, w11 +; NONEON-NOSVE-NEXT: lsr w9, w9, #5 +; NONEON-NOSVE-NEXT: lsr w10, w10, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: lsr w8, w11, #5 +; NONEON-NOSVE-NEXT: strh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w10, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = sdiv <4 x i8> %op1, shufflevector (<4 x i8> insertelement (<4 x i8> poison, i8 32, i32 0), <4 x i8> poison, <4 x i32> zeroinitializer) ret <4 x i8> %res @@ -43,9 +66,58 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1) { ; ; NONEON-NOSVE-LABEL: sdiv_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmlt v1.8b, v0.8b, #0 -; NONEON-NOSVE-NEXT: usra v0.8b, v1.8b, #3 -; NONEON-NOSVE-NEXT: sshr v0.8b, v0.8b, #5 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #7] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #6] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #5] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #4] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #3] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #11] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #2] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #1] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; 
NONEON-NOSVE-NEXT: ret %res = sdiv <8 x i8> %op1, shufflevector (<8 x i8> insertelement (<8 x i8> poison, i8 32, i32 0), <8 x i8> poison, <8 x i32> zeroinitializer) ret <8 x i8> %res @@ -62,9 +134,106 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1) { ; ; NONEON-NOSVE-LABEL: sdiv_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmlt v1.16b, v0.16b, #0 -; NONEON-NOSVE-NEXT: usra v0.16b, v1.16b, #3 -; NONEON-NOSVE-NEXT: sshr v0.16b, v0.16b, #5 +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #11] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #10] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #9] +; 
NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #7] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #6] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #5] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #4] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #3] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #2] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #1] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: 
lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sdiv <16 x i8> %op1, shufflevector (<16 x i8> insertelement (<16 x i8> poison, i8 32, i32 0), <16 x i8> poison, <16 x i32> zeroinitializer) ret <16 x i8> %res @@ -82,14 +251,204 @@ define void @sdiv_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: sdiv_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: cmlt v2.16b, v0.16b, #0 -; NONEON-NOSVE-NEXT: cmlt v3.16b, v1.16b, #0 -; NONEON-NOSVE-NEXT: usra v0.16b, v2.16b, #3 -; NONEON-NOSVE-NEXT: usra v1.16b, v3.16b, #3 -; NONEON-NOSVE-NEXT: sshr v0.16b, v0.16b, #5 -; NONEON-NOSVE-NEXT: sshr v1.16b, v1.16b, #5 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, 
#5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; 
NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #11] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #10] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #7] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #6] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; 
NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #5] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #4] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #3] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #2] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #1] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %res = sdiv <32 x i8> %op1, shufflevector (<32 x i8> insertelement (<32 x i8> poison, i8 32, i32 0), <32 x i8> poison, <32 x i32> zeroinitializer) @@ -109,16 +468,20 @@ define <2 x i16> @sdiv_v2i16(<2 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: sdiv_v2i16: ; NONEON-NOSVE: // %bb.0: -; 
NONEON-NOSVE-NEXT: shl v1.2s, v0.2s, #16 -; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f -; NONEON-NOSVE-NEXT: dup v2.2s, w8 -; NONEON-NOSVE-NEXT: sshr v1.2s, v1.2s, #16 -; NONEON-NOSVE-NEXT: ushr v1.2s, v1.2s, #26 -; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: add v0.2s, v0.2s, v1.2s -; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16 -; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16 -; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #5 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp] +; NONEON-NOSVE-NEXT: sxth w10, w8 +; NONEON-NOSVE-NEXT: sxth w11, w9 +; NONEON-NOSVE-NEXT: ubfx w10, w10, #26, #5 +; NONEON-NOSVE-NEXT: ubfx w11, w11, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w10 +; NONEON-NOSVE-NEXT: add w9, w9, w11 +; NONEON-NOSVE-NEXT: sbfx w8, w8, #5, #11 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #5, #11 +; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = sdiv <2 x i16> %op1, shufflevector (<2 x i16> insertelement (<2 x i16> poison, i16 32, i32 0), <2 x i16> poison, <2 x i32> zeroinitializer) ret <2 x i16> %res @@ -135,9 +498,34 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: sdiv_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmlt v1.4h, v0.4h, #0 -; NONEON-NOSVE-NEXT: usra v0.4h, v1.4h, #11 -; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #5 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #6] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #4] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #2] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = sdiv <4 x i16> %op1, shufflevector (<4 x i16> insertelement (<4 x i16> poison, i16 32, i32 0), <4 x i16> poison, <4 x i32> zeroinitializer) ret <4 x i16> %res @@ -154,9 +542,58 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: sdiv_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmlt v1.8h, v0.8h, #0 -; NONEON-NOSVE-NEXT: usra v0.8h, v1.8h, #11 -; NONEON-NOSVE-NEXT: sshr v0.8h, v0.8h, #5 +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #6] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #4] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #2] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; 
NONEON-NOSVE-NEXT: ret %res = sdiv <8 x i16> %op1, shufflevector (<8 x i16> insertelement (<8 x i16> poison, i16 32, i32 0), <8 x i16> poison, <8 x i32> zeroinitializer) ret <8 x i16> %res @@ -174,14 +611,108 @@ define void @sdiv_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: sdiv_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: cmlt v2.8h, v0.8h, #0 -; NONEON-NOSVE-NEXT: cmlt v3.8h, v1.8h, #0 -; NONEON-NOSVE-NEXT: usra v0.8h, v2.8h, #11 -; NONEON-NOSVE-NEXT: usra v1.8h, v3.8h, #11 -; NONEON-NOSVE-NEXT: sshr v0.8h, v0.8h, #5 -; NONEON-NOSVE-NEXT: sshr v1.8h, v1.8h, #5 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 
+; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #6] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #4] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: 
strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #2] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = sdiv <16 x i16> %op1, shufflevector (<16 x i16> insertelement (<16 x i16> poison, i16 32, i32 0), <16 x i16> poison, <16 x i32> zeroinitializer) @@ -200,9 +731,19 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: sdiv_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmlt v1.2s, v0.2s, #0 -; NONEON-NOSVE-NEXT: usra v0.2s, v1.2s, #27 -; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #5 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #27 +; NONEON-NOSVE-NEXT: asr w10, w8, #5 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #27 +; NONEON-NOSVE-NEXT: asr w8, w8, #5 +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = sdiv <2 x i32> %op1, shufflevector (<2 x i32> insertelement (<2 x i32> poison, i32 32, i32 0), <2 x i32> poison, <2 x i32> zeroinitializer) ret <2 x i32> %res @@ -219,9 +760,28 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: sdiv_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmlt v1.4s, v0.4s, #0 -; NONEON-NOSVE-NEXT: usra v0.4s, v1.4s, #27 -; NONEON-NOSVE-NEXT: sshr v0.4s, v0.4s, #5 +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #27 +; NONEON-NOSVE-NEXT: asr w10, w8, #5 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #27 +; NONEON-NOSVE-NEXT: asr w8, w8, #5 +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #27 +; NONEON-NOSVE-NEXT: asr w10, w8, #5 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #27 +; NONEON-NOSVE-NEXT: asr w8, w8, #5 +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sdiv <4 x i32> %op1, shufflevector (<4 x i32> insertelement (<4 x i32> poison, i32 32, i32 0), <4 x i32> poison, <4 x i32> 
zeroinitializer) ret <4 x i32> %res @@ -239,14 +799,48 @@ define void @sdiv_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: sdiv_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: cmlt v2.4s, v0.4s, #0 -; NONEON-NOSVE-NEXT: cmlt v3.4s, v1.4s, #0 -; NONEON-NOSVE-NEXT: usra v0.4s, v2.4s, #27 -; NONEON-NOSVE-NEXT: usra v1.4s, v3.4s, #27 -; NONEON-NOSVE-NEXT: sshr v0.4s, v0.4s, #5 -; NONEON-NOSVE-NEXT: sshr v1.4s, v1.4s, #5 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #27 +; NONEON-NOSVE-NEXT: asr w10, w8, #5 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #27 +; NONEON-NOSVE-NEXT: asr w8, w8, #5 +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #27 +; NONEON-NOSVE-NEXT: asr w10, w8, #5 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #27 +; NONEON-NOSVE-NEXT: asr w8, w8, #5 +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #27 +; NONEON-NOSVE-NEXT: asr w10, w8, #5 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #27 +; NONEON-NOSVE-NEXT: asr w8, w8, #5 +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #27 +; NONEON-NOSVE-NEXT: asr w10, w8, #5 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #27 +; NONEON-NOSVE-NEXT: asr 
w8, w8, #5 +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = sdiv <8 x i32> %op1, shufflevector (<8 x i32> insertelement (<8 x i32> poison, i32 32, i32 0), <8 x i32> poison, <8 x i32> zeroinitializer) @@ -265,9 +859,15 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1) { ; ; NONEON-NOSVE-LABEL: sdiv_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmlt d1, d0, #0 -; NONEON-NOSVE-NEXT: usra d0, d1, #59 -; NONEON-NOSVE-NEXT: sshr d0, d0, #5 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: asr x9, x8, #63 +; NONEON-NOSVE-NEXT: add x8, x8, x9, lsr #59 +; NONEON-NOSVE-NEXT: asr x8, x8, #5 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = sdiv <1 x i64> %op1, shufflevector (<1 x i64> insertelement (<1 x i64> poison, i64 32, i32 0), <1 x i64> poison, <1 x i32> zeroinitializer) ret <1 x i64> %res @@ -285,9 +885,19 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1) { ; ; NONEON-NOSVE-LABEL: sdiv_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmlt v1.2d, v0.2d, #0 -; NONEON-NOSVE-NEXT: usra v0.2d, v1.2d, #59 -; NONEON-NOSVE-NEXT: sshr v0.2d, v0.2d, #5 +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: asr x9, x8, #63 +; NONEON-NOSVE-NEXT: add x8, x8, x9, lsr #59 +; NONEON-NOSVE-NEXT: asr x10, x8, #5 +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: asr x9, x8, #63 +; NONEON-NOSVE-NEXT: add x8, x8, x9, lsr #59 +; NONEON-NOSVE-NEXT: asr x8, x8, #5 +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sdiv <2 x i64> %op1, shufflevector (<2 x i64> insertelement (<2 x i64> poison, i64 32, i32 0), <2 x i64> poison, <2 x i32> zeroinitializer) ret <2 x i64> %res @@ -305,14 +915,30 @@ define void @sdiv_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: sdiv_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: cmlt v2.2d, v0.2d, #0 -; NONEON-NOSVE-NEXT: cmlt v3.2d, v1.2d, #0 -; NONEON-NOSVE-NEXT: usra v0.2d, v2.2d, #59 -; NONEON-NOSVE-NEXT: usra v1.2d, v3.2d, #59 -; NONEON-NOSVE-NEXT: sshr v0.2d, v0.2d, #5 -; NONEON-NOSVE-NEXT: sshr v1.2d, v1.2d, #5 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: asr x9, x8, #63 +; NONEON-NOSVE-NEXT: add x8, x8, x9, lsr #59 +; NONEON-NOSVE-NEXT: asr x10, x8, #5 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: asr x9, x8, #63 +; NONEON-NOSVE-NEXT: add x8, x8, x9, lsr #59 +; NONEON-NOSVE-NEXT: asr x8, x8, #5 +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: asr x9, x8, #63 +; NONEON-NOSVE-NEXT: add x8, x8, x9, lsr #59 +; NONEON-NOSVE-NEXT: asr x10, x8, #5 +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: asr x9, x8, #63 +; NONEON-NOSVE-NEXT: add x8, x8, x9, lsr #59 +; NONEON-NOSVE-NEXT: asr x8, x8, #5 +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = sdiv <4 x i64> %op1, shufflevector (<4 x i64> insertelement (<4 x i64> poison, i64 32, i32 0), <4 x i64> poison, <4 x i32> zeroinitializer) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll index 38aaf860b7298c..b66e6d90135730 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll @@ -18,9 +18,15 @@ define void @hang_when_merging_stores_after_legalisation(ptr %a, <2 x i32> %b) { ; ; NONEON-NOSVE-LABEL: hang_when_merging_stores_after_legalisation: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: dup v0.4s, v0.s[0] +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w8, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, w8, [sp, #16] +; 
NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <8 x i32> zeroinitializer %interleaved.vec = shufflevector <8 x i32> %splat, <8 x i32> undef, <8 x i32> @@ -39,9 +45,25 @@ define void @interleave_store_without_splat(ptr %a, <4 x i32> %v1, <4 x i32> %v2 ; ; NONEON-NOSVE-LABEL: interleave_store_without_splat: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: zip2 v2.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: zip1 v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: stp q0, q2, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #36] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr q1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %shuffle = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> %interleaved = shufflevector <8 x i32> %shuffle, <8 x i32> undef, <8 x i32> @@ -64,12 +86,40 @@ define void @interleave_store_legalization(ptr %a, <8 x i32> %v1, <8 x i32> %v2) ; ; NONEON-NOSVE-LABEL: interleave_store_legalization: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: zip2 v4.4s, v1.4s, v3.4s -; NONEON-NOSVE-NEXT: zip1 v1.4s, v1.4s, v3.4s -; NONEON-NOSVE-NEXT: zip2 v3.4s, v0.4s, v2.4s -; NONEON-NOSVE-NEXT: zip1 v0.4s, v0.4s, v2.4s -; NONEON-NOSVE-NEXT: stp q1, q4, [x0, #32] -; NONEON-NOSVE-NEXT: stp q0, q3, [x0] 
+; NONEON-NOSVE-NEXT: sub sp, sp, #128 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: stp q1, q3, [sp, #16] +; NONEON-NOSVE-NEXT: stp q0, q2, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #100] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #84] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #96] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #80] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #108] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #92] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #88] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #36] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldr q3, [sp, #112] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr q1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldp q0, q2, [sp, #48] +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %interleaved.vec = shufflevector <8 x i32> %v1, <8 x i32> %v2, <16 x i32> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll index e15529e1926ac7..a4cf5d608fed6d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll @@ -19,7 +19,14 @@ define <4 x i8> @splat_v4i8(i8 %a) { ; ; NONEON-NOSVE-LABEL: splat_v4i8: ; NONEON-NOSVE: // %bb.0: -; 
NONEON-NOSVE-NEXT: dup v0.4h, w0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: strh w0, [sp, #14] +; NONEON-NOSVE-NEXT: strh w0, [sp, #12] +; NONEON-NOSVE-NEXT: strh w0, [sp, #10] +; NONEON-NOSVE-NEXT: strh w0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x i8> undef, i8 %a, i64 0 %splat = shufflevector <4 x i8> %insert, <4 x i8> undef, <4 x i32> zeroinitializer @@ -35,7 +42,18 @@ define <8 x i8> @splat_v8i8(i8 %a) { ; ; NONEON-NOSVE-LABEL: splat_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: dup v0.8b, w0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: strb w0, [sp, #15] +; NONEON-NOSVE-NEXT: strb w0, [sp, #14] +; NONEON-NOSVE-NEXT: strb w0, [sp, #13] +; NONEON-NOSVE-NEXT: strb w0, [sp, #12] +; NONEON-NOSVE-NEXT: strb w0, [sp, #11] +; NONEON-NOSVE-NEXT: strb w0, [sp, #10] +; NONEON-NOSVE-NEXT: strb w0, [sp, #9] +; NONEON-NOSVE-NEXT: strb w0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x i8> undef, i8 %a, i64 0 %splat = shufflevector <8 x i8> %insert, <8 x i8> undef, <8 x i32> zeroinitializer @@ -51,7 +69,25 @@ define <16 x i8> @splat_v16i8(i8 %a) { ; ; NONEON-NOSVE-LABEL: splat_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: dup v0.16b, w0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: strb w0, [sp, #15] +; NONEON-NOSVE-NEXT: strb w0, [sp, #14] +; NONEON-NOSVE-NEXT: strb w0, [sp, #13] +; NONEON-NOSVE-NEXT: strb w0, [sp, #12] +; NONEON-NOSVE-NEXT: strb w0, [sp, #11] +; NONEON-NOSVE-NEXT: strb w0, [sp, #10] +; NONEON-NOSVE-NEXT: strb w0, [sp, #9] +; NONEON-NOSVE-NEXT: strb w0, [sp, #8] +; NONEON-NOSVE-NEXT: strb w0, [sp, #7] +; NONEON-NOSVE-NEXT: strb w0, [sp, #6] +; NONEON-NOSVE-NEXT: strb w0, 
[sp, #5] +; NONEON-NOSVE-NEXT: strb w0, [sp, #4] +; NONEON-NOSVE-NEXT: strb w0, [sp, #3] +; NONEON-NOSVE-NEXT: strb w0, [sp, #2] +; NONEON-NOSVE-NEXT: strb w0, [sp, #1] +; NONEON-NOSVE-NEXT: strb w0, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <16 x i8> undef, i8 %a, i64 0 %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer @@ -67,8 +103,27 @@ define void @splat_v32i8(i8 %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: splat_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: dup v0.16b, w0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: strb w0, [sp, #15] +; NONEON-NOSVE-NEXT: strb w0, [sp, #14] +; NONEON-NOSVE-NEXT: strb w0, [sp, #13] +; NONEON-NOSVE-NEXT: strb w0, [sp, #12] +; NONEON-NOSVE-NEXT: strb w0, [sp, #11] +; NONEON-NOSVE-NEXT: strb w0, [sp, #10] +; NONEON-NOSVE-NEXT: strb w0, [sp, #9] +; NONEON-NOSVE-NEXT: strb w0, [sp, #8] +; NONEON-NOSVE-NEXT: strb w0, [sp, #7] +; NONEON-NOSVE-NEXT: strb w0, [sp, #6] +; NONEON-NOSVE-NEXT: strb w0, [sp, #5] +; NONEON-NOSVE-NEXT: strb w0, [sp, #4] +; NONEON-NOSVE-NEXT: strb w0, [sp, #3] +; NONEON-NOSVE-NEXT: strb w0, [sp, #2] +; NONEON-NOSVE-NEXT: strb w0, [sp, #1] +; NONEON-NOSVE-NEXT: strb w0, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp] ; NONEON-NOSVE-NEXT: stp q0, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <32 x i8> undef, i8 %a, i64 0 %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer @@ -85,7 +140,11 @@ define <2 x i16> @splat_v2i16(i16 %a) { ; ; NONEON-NOSVE-LABEL: splat_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: dup v0.2s, w0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: stp w0, w0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <2 x i16> 
undef, i16 %a, i64 0 %splat = shufflevector <2 x i16> %insert, <2 x i16> undef, <2 x i32> zeroinitializer @@ -101,7 +160,14 @@ define <4 x i16> @splat_v4i16(i16 %a) { ; ; NONEON-NOSVE-LABEL: splat_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: dup v0.4h, w0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: strh w0, [sp, #14] +; NONEON-NOSVE-NEXT: strh w0, [sp, #12] +; NONEON-NOSVE-NEXT: strh w0, [sp, #10] +; NONEON-NOSVE-NEXT: strh w0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x i16> undef, i16 %a, i64 0 %splat = shufflevector <4 x i16> %insert, <4 x i16> undef, <4 x i32> zeroinitializer @@ -117,7 +183,17 @@ define <8 x i16> @splat_v8i16(i16 %a) { ; ; NONEON-NOSVE-LABEL: splat_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: dup v0.8h, w0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: strh w0, [sp, #14] +; NONEON-NOSVE-NEXT: strh w0, [sp, #12] +; NONEON-NOSVE-NEXT: strh w0, [sp, #10] +; NONEON-NOSVE-NEXT: strh w0, [sp, #8] +; NONEON-NOSVE-NEXT: strh w0, [sp, #6] +; NONEON-NOSVE-NEXT: strh w0, [sp, #4] +; NONEON-NOSVE-NEXT: strh w0, [sp, #2] +; NONEON-NOSVE-NEXT: strh w0, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x i16> undef, i16 %a, i64 0 %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer @@ -133,8 +209,19 @@ define void @splat_v16i16(i16 %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: splat_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: dup v0.8h, w0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: strh w0, [sp, #14] +; NONEON-NOSVE-NEXT: strh w0, [sp, #12] +; NONEON-NOSVE-NEXT: strh w0, [sp, #10] +; NONEON-NOSVE-NEXT: strh w0, [sp, #8] +; NONEON-NOSVE-NEXT: strh w0, [sp, #6] +; NONEON-NOSVE-NEXT: strh 
w0, [sp, #4] +; NONEON-NOSVE-NEXT: strh w0, [sp, #2] +; NONEON-NOSVE-NEXT: strh w0, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp] ; NONEON-NOSVE-NEXT: stp q0, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <16 x i16> undef, i16 %a, i64 0 %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer @@ -151,7 +238,11 @@ define <2 x i32> @splat_v2i32(i32 %a) { ; ; NONEON-NOSVE-LABEL: splat_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: dup v0.2s, w0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: stp w0, w0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <2 x i32> undef, i32 %a, i64 0 %splat = shufflevector <2 x i32> %insert, <2 x i32> undef, <2 x i32> zeroinitializer @@ -167,7 +258,11 @@ define <4 x i32> @splat_v4i32(i32 %a) { ; ; NONEON-NOSVE-LABEL: splat_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: dup v0.4s, w0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: stp w0, w0, [sp, #8] +; NONEON-NOSVE-NEXT: stp w0, w0, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x i32> undef, i32 %a, i64 0 %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer @@ -183,8 +278,13 @@ define void @splat_v8i32(i32 %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: splat_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: dup v0.4s, w0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: stp w0, w0, [sp, #8] +; NONEON-NOSVE-NEXT: stp w0, w0, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp] ; NONEON-NOSVE-NEXT: stp q0, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x i32> undef, i32 %a, i64 0 %splat = shufflevector <8 x i32> %insert, <8 x i32> 
undef, <8 x i32> zeroinitializer @@ -201,7 +301,11 @@ define <1 x i64> @splat_v1i64(i64 %a) { ; ; NONEON-NOSVE-LABEL: splat_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov d0, x0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str x0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <1 x i64> undef, i64 %a, i64 0 %splat = shufflevector <1 x i64> %insert, <1 x i64> undef, <1 x i32> zeroinitializer @@ -217,7 +321,9 @@ define <2 x i64> @splat_v2i64(i64 %a) { ; ; NONEON-NOSVE-LABEL: splat_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: dup v0.2d, x0 +; NONEON-NOSVE-NEXT: stp x0, x0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <2 x i64> undef, i64 %a, i64 0 %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer @@ -233,8 +339,11 @@ define void @splat_v4i64(i64 %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: splat_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: dup v0.2d, x0 +; NONEON-NOSVE-NEXT: stp x0, x0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp] ; NONEON-NOSVE-NEXT: stp q0, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x i64> undef, i64 %a, i64 0 %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer @@ -256,8 +365,12 @@ define <2 x half> @splat_v2f16(half %a) { ; ; NONEON-NOSVE-LABEL: splat_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $h0 killed $h0 def $q0 -; NONEON-NOSVE-NEXT: dup v0.4h, v0.h[0] +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <2 x half> undef, half %a, i64 0 %splat = shufflevector <2 x half> %insert, <2 x half> undef, <2 x i32> zeroinitializer @@ -274,8 +387,14 @@ define <4 x half> @splat_v4f16(half %a) { ; ; NONEON-NOSVE-LABEL: splat_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $h0 killed $h0 def $q0 -; NONEON-NOSVE-NEXT: dup v0.4h, v0.h[0] +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x half> undef, half %a, i64 0 %splat = shufflevector <4 x half> %insert, <4 x half> undef, <4 x i32> zeroinitializer @@ -292,8 +411,17 @@ define <8 x half> @splat_v8f16(half %a) { ; ; NONEON-NOSVE-LABEL: splat_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $h0 killed $h0 def $q0 -; NONEON-NOSVE-NEXT: dup v0.8h, v0.h[0] +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: 
str h0, [sp, #14] +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: str h0, [sp, #6] +; NONEON-NOSVE-NEXT: str h0, [sp, #4] +; NONEON-NOSVE-NEXT: str h0, [sp, #2] +; NONEON-NOSVE-NEXT: str h0, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x half> undef, half %a, i64 0 %splat = shufflevector <8 x half> %insert, <8 x half> undef, <8 x i32> zeroinitializer @@ -310,9 +438,19 @@ define void @splat_v16f16(half %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: splat_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $h0 killed $h0 def $q0 -; NONEON-NOSVE-NEXT: dup v0.8h, v0.h[0] +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: str h0, [sp, #6] +; NONEON-NOSVE-NEXT: str h0, [sp, #4] +; NONEON-NOSVE-NEXT: str h0, [sp, #2] +; NONEON-NOSVE-NEXT: str h0, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <16 x half> undef, half %a, i64 0 %splat = shufflevector <16 x half> %insert, <16 x half> undef, <16 x i32> zeroinitializer @@ -330,8 +468,11 @@ define <2 x float> @splat_v2f32(float %a, <2 x float> %op2) { ; ; NONEON-NOSVE-LABEL: splat_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $s0 killed $s0 def $q0 -; NONEON-NOSVE-NEXT: dup v0.2s, v0.s[0] +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: stp s0, s0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <2 x float> undef, float %a, i64 0 %splat = shufflevector <2 x float> %insert, <2 x float> undef, 
<2 x i32> zeroinitializer @@ -348,8 +489,11 @@ define <4 x float> @splat_v4f32(float %a, <4 x float> %op2) { ; ; NONEON-NOSVE-LABEL: splat_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $s0 killed $s0 def $q0 -; NONEON-NOSVE-NEXT: dup v0.4s, v0.s[0] +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: stp s0, s0, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s0, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x float> undef, float %a, i64 0 %splat = shufflevector <4 x float> %insert, <4 x float> undef, <4 x i32> zeroinitializer @@ -366,9 +510,13 @@ define void @splat_v8f32(float %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: splat_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $s0 killed $s0 def $q0 -; NONEON-NOSVE-NEXT: dup v0.4s, v0.s[0] +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: stp s0, s0, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s0, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x float> undef, float %a, i64 0 %splat = shufflevector <8 x float> %insert, <8 x float> undef, <8 x i32> zeroinitializer @@ -383,6 +531,11 @@ define <1 x double> @splat_v1f64(double %a, <1 x double> %op2) { ; ; NONEON-NOSVE-LABEL: splat_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <1 x double> undef, double %a, i64 0 %splat = shufflevector <1 x double> %insert, <1 x double> undef, <1 x i32> zeroinitializer @@ -399,8 +552,9 @@ define <2 x double> @splat_v2f64(double %a, <2 x double> %op2) { ; ; NONEON-NOSVE-LABEL: splat_v2f64: ; NONEON-NOSVE: // %bb.0: 
-; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: dup v0.2d, v0.d[0] +; NONEON-NOSVE-NEXT: stp d0, d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <2 x double> undef, double %a, i64 0 %splat = shufflevector <2 x double> %insert, <2 x double> undef, <2 x i32> zeroinitializer @@ -417,9 +571,11 @@ define void @splat_v4f64(double %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: splat_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: dup v0.2d, v0.d[0] +; NONEON-NOSVE-NEXT: stp d0, d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x double> undef, double %a, i64 0 %splat = shufflevector <4 x double> %insert, <4 x double> undef, <4 x i32> zeroinitializer @@ -440,7 +596,8 @@ define void @splat_imm_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: splat_imm_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.16b, #1 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI24_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI24_0] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] ; NONEON-NOSVE-NEXT: ret %insert = insertelement <32 x i8> undef, i8 1, i64 0 @@ -458,8 +615,8 @@ define void @splat_imm_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: splat_imm_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #2 // =0x2 -; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI25_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI25_0] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] ; NONEON-NOSVE-NEXT: ret %insert = insertelement <16 x i16> undef, i16 2, i64 0 @@ -477,8 +634,8 @@ define void @splat_imm_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: splat_imm_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #3 // =0x3 -; NONEON-NOSVE-NEXT: dup 
v0.4s, w8 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI26_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI26_0] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] ; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x i32> undef, i32 3, i64 0 @@ -496,8 +653,8 @@ define void @splat_imm_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: splat_imm_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #4 // =0x4 -; NONEON-NOSVE-NEXT: dup v0.2d, x8 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI27_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI27_0] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] ; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x i64> undef, i64 4, i64 0 @@ -519,8 +676,8 @@ define void @splat_imm_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: splat_imm_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #17664 // =0x4500 -; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI28_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI28_0] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] ; NONEON-NOSVE-NEXT: ret %insert = insertelement <16 x half> undef, half 5.0, i64 0 @@ -538,7 +695,8 @@ define void @splat_imm_v8f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: splat_imm_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov v0.4s, #6.00000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI29_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI29_0] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] ; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x float> undef, float 6.0, i64 0 @@ -556,7 +714,8 @@ define void @splat_imm_v4f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: splat_imm_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov v0.2d, #7.00000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI30_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI30_0] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] ; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x double> undef, double 7.0, i64 0 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll index 
f055061b13bed6..a77ac7832e17cb 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll @@ -31,7 +31,8 @@ define void @store_v8i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI1_0 +; NONEON-NOSVE-NEXT: ldr d0, [x8, :lo12:.LCPI1_0] ; NONEON-NOSVE-NEXT: str d0, [x0] ; NONEON-NOSVE-NEXT: ret store <8 x i8> zeroinitializer, ptr %a @@ -47,7 +48,8 @@ define void @store_v16i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI2_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI2_0] ; NONEON-NOSVE-NEXT: str q0, [x0] ; NONEON-NOSVE-NEXT: ret store <16 x i8> zeroinitializer, ptr %a @@ -63,7 +65,8 @@ define void @store_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI3_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI3_0] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] ; NONEON-NOSVE-NEXT: ret store <32 x i8> zeroinitializer, ptr %a @@ -96,7 +99,14 @@ define void @store_v2f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str wzr, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI5_0 +; NONEON-NOSVE-NEXT: ldr d0, [x8, :lo12:.LCPI5_0] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: str w8, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret store <2 x half> zeroinitializer, ptr %a ret void @@ -111,7 +121,8 @@ define void @store_v4i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; 
NONEON-NOSVE-NEXT: adrp x8, .LCPI6_0 +; NONEON-NOSVE-NEXT: ldr d0, [x8, :lo12:.LCPI6_0] ; NONEON-NOSVE-NEXT: str d0, [x0] ; NONEON-NOSVE-NEXT: ret store <4 x i16> zeroinitializer, ptr %a @@ -127,7 +138,8 @@ define void @store_v4f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d0, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI7_0 +; NONEON-NOSVE-NEXT: ldr d0, [x8, :lo12:.LCPI7_0] ; NONEON-NOSVE-NEXT: str d0, [x0] ; NONEON-NOSVE-NEXT: ret store <4 x half> zeroinitializer, ptr %a @@ -143,7 +155,8 @@ define void @store_v8i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI8_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI8_0] ; NONEON-NOSVE-NEXT: str q0, [x0] ; NONEON-NOSVE-NEXT: ret store <8 x i16> zeroinitializer, ptr %a @@ -159,7 +172,8 @@ define void @store_v8f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI9_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI9_0] ; NONEON-NOSVE-NEXT: str q0, [x0] ; NONEON-NOSVE-NEXT: ret store <8 x half> zeroinitializer, ptr %a @@ -175,7 +189,8 @@ define void @store_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI10_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI10_0] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] ; NONEON-NOSVE-NEXT: ret store <16 x i16> zeroinitializer, ptr %a @@ -191,7 +206,8 @@ define void @store_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI11_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI11_0] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] ; NONEON-NOSVE-NEXT: ret store <16 x half> zeroinitializer, 
ptr %a @@ -263,7 +279,8 @@ define void @store_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI16_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI16_0] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] ; NONEON-NOSVE-NEXT: ret store <8 x i32> zeroinitializer, ptr %a @@ -279,7 +296,8 @@ define void @store_v8f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI17_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI17_0] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] ; NONEON-NOSVE-NEXT: ret store <8 x float> zeroinitializer, ptr %a @@ -295,8 +313,12 @@ define void @store_v1i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str xzr, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] ; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret store <1 x i64> zeroinitializer, ptr %a ret void @@ -311,8 +333,12 @@ define void @store_v1f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v1f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d0, #0000000000000000 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str xzr, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] ; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret store <1 x double> zeroinitializer, ptr %a ret void @@ -355,7 +381,8 @@ define void @store_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI22_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI22_0] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] 
; NONEON-NOSVE-NEXT: ret store <4 x i64> zeroinitializer, ptr %a @@ -371,7 +398,8 @@ define void @store_v4f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI23_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI23_0] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] ; NONEON-NOSVE-NEXT: ret store <4 x double> zeroinitializer, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll index 80c9ef87e9b915..a9f4d92b1e6b64 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll @@ -27,8 +27,12 @@ define void @subvector_v4i8(ptr %in, ptr %out) { ; ; NONEON-NOSVE-LABEL: subvector_v4i8: ; NONEON-NOSVE: // %bb.0: // %bb1 -; NONEON-NOSVE-NEXT: ldr w8, [x0] -; NONEON-NOSVE-NEXT: str w8, [x1] +; NONEON-NOSVE-NEXT: ldrh w8, [x0, #2] +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #1] +; NONEON-NOSVE-NEXT: ldrb w10, [x0] +; NONEON-NOSVE-NEXT: strh w8, [x1, #2] +; NONEON-NOSVE-NEXT: strb w9, [x1, #1] +; NONEON-NOSVE-NEXT: strb w10, [x1] ; NONEON-NOSVE-NEXT: ret %a = load <4 x i8>, ptr %in br label %bb1 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll index 41b68e10e75ded..30682751037fe5 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll @@ -17,8 +17,27 @@ define void @store_trunc_v8i16i8(ptr %ap, ptr %dest) { ; NONEON-NOSVE-LABEL: store_trunc_v8i16i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: xtn v0.8b, v0.8h +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] ; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %a = load <8 x i16>, ptr %ap %val = trunc <8 x i16> %a to <8 x i8> @@ -37,9 +56,15 @@ define void @store_trunc_v4i32i8(ptr %ap, ptr %dest) { ; NONEON-NOSVE-LABEL: store_trunc_v4i32i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s -; NONEON-NOSVE-NEXT: uzp1 v0.8b, v0.8b, v0.8b -; NONEON-NOSVE-NEXT: str s0, [x1] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w10, w11, [sp] +; NONEON-NOSVE-NEXT: strb w8, [x1, #3] +; NONEON-NOSVE-NEXT: strb w9, [x1, #2] +; NONEON-NOSVE-NEXT: strb w11, [x1, #1] +; NONEON-NOSVE-NEXT: strb w10, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %a = load <4 x i32>, ptr %ap %val = trunc <4 x i32> %a to <4 x i8> @@ -58,8 +83,17 @@ define void @store_trunc_v4i32i16(ptr %ap, ptr %dest) { ; NONEON-NOSVE-LABEL: store_trunc_v4i32i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w9, [sp, #30] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: strh w9, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] ; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %a = load <4 x i32>, ptr %ap %val = trunc <4 x i32> %a to <4 x i16> @@ -78,8 +112,13 @@ define void @store_trunc_v2i64i8(ptr %ap, ptr %dest) { ; NONEON-NOSVE-LABEL: store_trunc_v2i64i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: xtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] ; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %a = load <2 x i64>, ptr %ap %val = trunc <2 x i64> %a to <2 x i32> @@ -99,10 +138,15 @@ define void @store_trunc_v2i256i64(ptr %ap, ptr %dest) { ; ; NONEON-NOSVE-LABEL: store_trunc_v2i256i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr d0, [x0, #32] -; NONEON-NOSVE-NEXT: ldr d1, [x0] -; NONEON-NOSVE-NEXT: mov v1.d[1], v0.d[0] -; NONEON-NOSVE-NEXT: str q1, [x1] +; NONEON-NOSVE-NEXT: ldr x8, [x0, #32] +; NONEON-NOSVE-NEXT: ldr x9, [x0] +; NONEON-NOSVE-NEXT: stp x9, x8, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %a = load <2 x i256>, ptr %ap %val = trunc <2 x i256> %a to <2 x i64> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll index 8242b4e26d5057..bc046059f0bd59 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll @@ -24,7 +24,41 @@ define <16 x i8> @trunc_v16i16_v16i8(ptr %in) nounwind { ; NONEON-NOSVE-LABEL: trunc_v16i16_v16i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: 
strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %a = load <16 x i16>, ptr %in %b = trunc <16 x i16> %a to <16 x i8> @@ -51,13 +85,125 @@ define void @trunc_v32i16_v32i8(ptr %in, ptr %out) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v32i16_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: sub sp, sp, #208 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #112] // 16-byte Folded Spill ; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] -; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: uzp1 v1.16b, v3.16b, v2.16b -; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b -; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #128] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #144] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #160] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #16] +; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w25, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w26, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w29, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w27, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w28, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w23, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w24, [sp, #26] +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #176] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w21, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; 
NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: ldrh w22, [sp, #22] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: add w8, w29, w29 +; NONEON-NOSVE-NEXT: ldrh w4, [sp, #44] +; NONEON-NOSVE-NEXT: strb w9, [sp, #91] +; NONEON-NOSVE-NEXT: add w9, w28, w28 +; NONEON-NOSVE-NEXT: ldrh w7, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: add w8, w27, w27 +; NONEON-NOSVE-NEXT: ldrh w2, [sp, #40] +; NONEON-NOSVE-NEXT: strb w9, [sp, #89] +; NONEON-NOSVE-NEXT: add w9, w26, w26 +; NONEON-NOSVE-NEXT: ldrh w3, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: add w8, w25, w25 +; NONEON-NOSVE-NEXT: ldrh w18, [sp, #36] +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #192] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w19, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w20, [sp, #18] +; NONEON-NOSVE-NEXT: strb w9, [sp, #87] +; NONEON-NOSVE-NEXT: add w9, w24, w24 +; NONEON-NOSVE-NEXT: ldrh w0, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: add w8, w23, w23 +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #60] +; NONEON-NOSVE-NEXT: strb w9, [sp, #85] +; NONEON-NOSVE-NEXT: add w9, w22, w22 +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #62] +; NONEON-NOSVE-NEXT: add w6, w12, w12 +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: add w8, w21, w21 +; NONEON-NOSVE-NEXT: add w5, w13, w13 +; NONEON-NOSVE-NEXT: strb w9, [sp, #83] +; NONEON-NOSVE-NEXT: add w9, w20, w20 +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: add w8, w19, w19 +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #32] +; NONEON-NOSVE-NEXT: strb w9, [sp, #81] +; NONEON-NOSVE-NEXT: add w9, w7, w7 +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: add w8, w4, w4 +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #76] +; NONEON-NOSVE-NEXT: strb w9, [sp, #111] +; NONEON-NOSVE-NEXT: add w9, w3, w3 +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #78] +; NONEON-NOSVE-NEXT: strb w8, [sp, #110] +; NONEON-NOSVE-NEXT: add w8, w2, 
w2 +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #72] +; NONEON-NOSVE-NEXT: strb w9, [sp, #109] +; NONEON-NOSVE-NEXT: add w9, w0, w0 +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #74] +; NONEON-NOSVE-NEXT: strb w8, [sp, #108] +; NONEON-NOSVE-NEXT: add w8, w18, w18 +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #68] +; NONEON-NOSVE-NEXT: strb w9, [sp, #107] +; NONEON-NOSVE-NEXT: add w9, w17, w17 +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #70] +; NONEON-NOSVE-NEXT: strb w8, [sp, #106] +; NONEON-NOSVE-NEXT: add w8, w16, w16 +; NONEON-NOSVE-NEXT: ldrh w30, [sp, #58] +; NONEON-NOSVE-NEXT: strb w9, [sp, #105] +; NONEON-NOSVE-NEXT: add w9, w15, w15 +; NONEON-NOSVE-NEXT: strb w8, [sp, #104] +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: strb w9, [sp, #103] +; NONEON-NOSVE-NEXT: add w9, w13, w13 +; NONEON-NOSVE-NEXT: strb w8, [sp, #102] +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: strb w9, [sp, #101] +; NONEON-NOSVE-NEXT: add w9, w11, w11 +; NONEON-NOSVE-NEXT: strb w8, [sp, #100] +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: strb w9, [sp, #99] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #98] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w5, [sp, #95] +; NONEON-NOSVE-NEXT: add w5, w30, w30 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w6, [sp, #94] +; NONEON-NOSVE-NEXT: strb w5, [sp, #93] +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #192] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #97] +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #176] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #96] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #160] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #80] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #144] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #128] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #112] 
// 16-byte Folded Reload ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #208 ; NONEON-NOSVE-NEXT: ret %a = load <32 x i16>, ptr %in %b = trunc <32 x i16> %a to <32 x i8> @@ -97,20 +243,276 @@ define void @trunc_v64i16_v64i8(ptr %in, ptr %out) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v64i16_v64i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #64] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #96] +; NONEON-NOSVE-NEXT: sub sp, sp, #448 +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #416] // 16-byte Folded Spill ; NONEON-NOSVE-NEXT: ldp q5, q4, [x0] -; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: ldp q6, q1, [x0, #32] -; NONEON-NOSVE-NEXT: uzp1 v2.16b, v3.16b, v2.16b -; NONEON-NOSVE-NEXT: uzp1 v3.16b, v5.16b, v4.16b -; NONEON-NOSVE-NEXT: uzp1 v1.16b, v6.16b, v1.16b -; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b -; NONEON-NOSVE-NEXT: add v2.16b, v2.16b, v2.16b -; NONEON-NOSVE-NEXT: add v3.16b, v3.16b, v3.16b -; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b -; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] -; NONEON-NOSVE-NEXT: stp q3, q1, [x1] +; NONEON-NOSVE-NEXT: str x1, [sp, #152] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #432] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #64] +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #400] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #96] +; NONEON-NOSVE-NEXT: stp q2, q4, [sp, #224] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #238] +; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #256] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #232] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #272] +; NONEON-NOSVE-NEXT: stp q5, q7, [sp, #160] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #230] +; NONEON-NOSVE-NEXT: add w21, w8, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #274] +; NONEON-NOSVE-NEXT: stp q6, q0, [sp, #192] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #228] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #226] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, 
#224] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #144] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #276] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #278] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #270] +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #268] +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #266] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #136] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #280] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #282] +; NONEON-NOSVE-NEXT: ldrh w18, [sp, #264] +; NONEON-NOSVE-NEXT: ldrh w0, [sp, #262] +; NONEON-NOSVE-NEXT: ldrh w1, [sp, #260] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #284] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #286] +; NONEON-NOSVE-NEXT: ldrh w2, [sp, #258] +; NONEON-NOSVE-NEXT: ldrh w3, [sp, #256] +; NONEON-NOSVE-NEXT: ldrh w4, [sp, #254] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #208] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #210] +; NONEON-NOSVE-NEXT: ldrh w5, [sp, #252] +; NONEON-NOSVE-NEXT: ldrh w6, [sp, #250] +; NONEON-NOSVE-NEXT: ldrh w7, [sp, #248] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #212] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #214] +; NONEON-NOSVE-NEXT: ldrh w19, [sp, #246] +; NONEON-NOSVE-NEXT: ldrh w20, [sp, #244] +; NONEON-NOSVE-NEXT: ldrh w22, [sp, #242] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #216] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #218] +; NONEON-NOSVE-NEXT: ldrh w23, [sp, #240] +; NONEON-NOSVE-NEXT: ldrh w24, [sp, #174] +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #384] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #96] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #220] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #222] +; NONEON-NOSVE-NEXT: ldrh w25, [sp, #172] +; NONEON-NOSVE-NEXT: ldrh w26, [sp, #170] +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #368] // 
16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #176] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #178] +; NONEON-NOSVE-NEXT: ldrh w27, [sp, #168] +; NONEON-NOSVE-NEXT: ldrh w28, [sp, #166] +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #352] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #180] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #182] +; NONEON-NOSVE-NEXT: ldrh w29, [sp, #164] +; NONEON-NOSVE-NEXT: ldrh w30, [sp, #162] +; NONEON-NOSVE-NEXT: strb w21, [sp, #335] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #184] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #186] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #188] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #190] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #192] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #194] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #196] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #198] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #200] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #202] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #204] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #206] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #160] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #236] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: str w8, [sp, #20] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #234] +; NONEON-NOSVE-NEXT: strb w9, [sp, #334] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #333] +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: strb w8, 
[sp, #332] +; NONEON-NOSVE-NEXT: add w8, w11, w11 +; NONEON-NOSVE-NEXT: strb w8, [sp, #331] +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: strb w8, [sp, #330] +; NONEON-NOSVE-NEXT: add w8, w13, w13 +; NONEON-NOSVE-NEXT: strb w8, [sp, #329] +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: strb w8, [sp, #328] +; NONEON-NOSVE-NEXT: add w8, w15, w15 +; NONEON-NOSVE-NEXT: strb w8, [sp, #327] +; NONEON-NOSVE-NEXT: add w8, w16, w16 +; NONEON-NOSVE-NEXT: strb w8, [sp, #326] +; NONEON-NOSVE-NEXT: add w8, w17, w17 +; NONEON-NOSVE-NEXT: strb w8, [sp, #325] +; NONEON-NOSVE-NEXT: add w8, w18, w18 +; NONEON-NOSVE-NEXT: strb w8, [sp, #324] +; NONEON-NOSVE-NEXT: add w8, w0, w0 +; NONEON-NOSVE-NEXT: strb w8, [sp, #323] +; NONEON-NOSVE-NEXT: add w8, w1, w1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #322] +; NONEON-NOSVE-NEXT: add w8, w2, w2 +; NONEON-NOSVE-NEXT: strb w8, [sp, #321] +; NONEON-NOSVE-NEXT: add w8, w3, w3 +; NONEON-NOSVE-NEXT: strb w8, [sp, #320] +; NONEON-NOSVE-NEXT: add w8, w4, w4 +; NONEON-NOSVE-NEXT: strb w8, [sp, #319] +; NONEON-NOSVE-NEXT: add w8, w5, w5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #318] +; NONEON-NOSVE-NEXT: add w8, w6, w6 +; NONEON-NOSVE-NEXT: strb w8, [sp, #317] +; NONEON-NOSVE-NEXT: add w8, w7, w7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #316] +; NONEON-NOSVE-NEXT: add w8, w19, w19 +; NONEON-NOSVE-NEXT: strb w8, [sp, #315] +; NONEON-NOSVE-NEXT: add w8, w20, w20 +; NONEON-NOSVE-NEXT: strb w8, [sp, #314] +; NONEON-NOSVE-NEXT: add w8, w22, w22 +; NONEON-NOSVE-NEXT: strb w8, [sp, #313] +; NONEON-NOSVE-NEXT: add w8, w23, w23 +; NONEON-NOSVE-NEXT: strb w8, [sp, #312] +; NONEON-NOSVE-NEXT: add w8, w24, w24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #311] +; NONEON-NOSVE-NEXT: add w8, w25, w25 +; NONEON-NOSVE-NEXT: strb w8, [sp, #310] +; NONEON-NOSVE-NEXT: add w8, w26, w26 +; NONEON-NOSVE-NEXT: strb w8, [sp, #309] +; NONEON-NOSVE-NEXT: add w8, w27, w27 +; NONEON-NOSVE-NEXT: strb w8, [sp, #308] +; NONEON-NOSVE-NEXT: add w8, w28, w28 +; NONEON-NOSVE-NEXT: 
strb w8, [sp, #307] +; NONEON-NOSVE-NEXT: add w8, w29, w29 +; NONEON-NOSVE-NEXT: strb w8, [sp, #306] +; NONEON-NOSVE-NEXT: add w8, w30, w30 +; NONEON-NOSVE-NEXT: strb w8, [sp, #305] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #432] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #416] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #304] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #400] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #384] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #303] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #368] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #352] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #302] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #301] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #36] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #300] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #40] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #299] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #44] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #298] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #297] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #296] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, 
w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #295] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #294] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #64] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #293] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #68] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #292] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #72] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #291] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #76] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #290] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #80] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #289] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #84] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #288] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #88] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q1, q3, [sp, #288] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #351] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #92] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #350] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #96] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #349] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #100] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #348] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #104] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #347] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #108] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #346] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #112] // 
4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #345] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #116] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #344] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #120] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #343] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #124] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #342] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #128] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #341] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #132] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #340] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #136] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #339] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #140] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #338] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #144] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #337] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #148] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #336] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #152] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q2, q0, [sp, #320] +; NONEON-NOSVE-NEXT: stp q3, q2, [x8] +; NONEON-NOSVE-NEXT: stp q0, q1, [x8, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #448 ; NONEON-NOSVE-NEXT: ret %a = load <64 x i16>, ptr %in %b = trunc <64 x i16> %a to <64 x i8> @@ -172,34 +574,598 @@ define void @trunc_v128i16_v128i8(ptr %in, ptr %out) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v128i16_v128i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #192] -; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #224] -; NONEON-NOSVE-NEXT: ldp q7, 
q6, [x0, #128] -; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: ldp q16, q1, [x0, #160] -; NONEON-NOSVE-NEXT: uzp1 v4.16b, v5.16b, v4.16b -; NONEON-NOSVE-NEXT: ldp q17, q5, [x0, #64] -; NONEON-NOSVE-NEXT: uzp1 v6.16b, v7.16b, v6.16b +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #-96]! // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #16] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #32] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: sub sp, sp, #800 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: str x1, [sp, #408] // 8-byte Folded Spill ; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] -; NONEON-NOSVE-NEXT: ldp q18, q7, [x0, #96] -; NONEON-NOSVE-NEXT: uzp1 v1.16b, v16.16b, v1.16b -; NONEON-NOSVE-NEXT: uzp1 v5.16b, v17.16b, v5.16b -; NONEON-NOSVE-NEXT: ldp q17, q16, [x0, #32] -; NONEON-NOSVE-NEXT: uzp1 v2.16b, v3.16b, v2.16b -; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b -; NONEON-NOSVE-NEXT: add v4.16b, v4.16b, v4.16b -; NONEON-NOSVE-NEXT: uzp1 v7.16b, v18.16b, v7.16b -; NONEON-NOSVE-NEXT: add v3.16b, v6.16b, v6.16b -; NONEON-NOSVE-NEXT: uzp1 v6.16b, v17.16b, v16.16b -; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b -; NONEON-NOSVE-NEXT: stp q0, q4, [x1, #96] -; NONEON-NOSVE-NEXT: add v0.16b, v5.16b, v5.16b -; NONEON-NOSVE-NEXT: add v2.16b, v2.16b, v2.16b -; NONEON-NOSVE-NEXT: add v4.16b, v7.16b, v7.16b -; NONEON-NOSVE-NEXT: stp q3, q1, [x1, #64] -; NONEON-NOSVE-NEXT: add v1.16b, v6.16b, v6.16b -; NONEON-NOSVE-NEXT: stp q0, q4, [x1, #32] -; NONEON-NOSVE-NEXT: stp q2, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #96] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #64] +; NONEON-NOSVE-NEXT: ldp q17, q16, [x0, #192] +; NONEON-NOSVE-NEXT: ldp q19, q18, [x0, #160] +; NONEON-NOSVE-NEXT: ldp q21, q20, [x0, 
#128] +; NONEON-NOSVE-NEXT: ldp q23, q22, [x0, #224] +; NONEON-NOSVE-NEXT: str q0, [sp, #592] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #606] +; NONEON-NOSVE-NEXT: str q19, [sp, #496] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #600] +; NONEON-NOSVE-NEXT: stp q18, q20, [sp, #512] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #598] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #596] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: stp q17, q23, [sp, #432] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #594] +; NONEON-NOSVE-NEXT: str w8, [sp, #64] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #432] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #592] +; NONEON-NOSVE-NEXT: stp q22, q16, [sp, #464] +; NONEON-NOSVE-NEXT: ldr w30, [sp, #64] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: str w8, [sp, #404] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #434] +; NONEON-NOSVE-NEXT: stp q4, q6, [sp, #560] +; NONEON-NOSVE-NEXT: str w8, [sp, #400] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #436] +; NONEON-NOSVE-NEXT: str q5, [sp, #544] +; NONEON-NOSVE-NEXT: str w8, [sp, #396] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #438] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #608] +; NONEON-NOSVE-NEXT: str w8, [sp, #392] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #440] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #638] +; NONEON-NOSVE-NEXT: stp q7, q21, [sp, #640] +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #636] +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #634] +; NONEON-NOSVE-NEXT: str w8, [sp, #388] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #442] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #666] +; NONEON-NOSVE-NEXT: str q3, [sp, #416] +; NONEON-NOSVE-NEXT: ldrh w18, [sp, #632] +; NONEON-NOSVE-NEXT: ldrh w0, [sp, #630] +; NONEON-NOSVE-NEXT: str w8, [sp, #384] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #444] +; NONEON-NOSVE-NEXT: ldrh w1, [sp, #628] +; NONEON-NOSVE-NEXT: ldrh w2, [sp, #626] +; NONEON-NOSVE-NEXT: ldrh w3, [sp, #624] +; NONEON-NOSVE-NEXT: 
ldrh w4, [sp, #622] +; NONEON-NOSVE-NEXT: str w8, [sp, #380] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #446] +; NONEON-NOSVE-NEXT: ldrh w5, [sp, #620] +; NONEON-NOSVE-NEXT: ldrh w6, [sp, #618] +; NONEON-NOSVE-NEXT: ldrh w7, [sp, #616] +; NONEON-NOSVE-NEXT: ldrh w19, [sp, #614] +; NONEON-NOSVE-NEXT: str w8, [sp, #376] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #480] +; NONEON-NOSVE-NEXT: ldrh w20, [sp, #612] +; NONEON-NOSVE-NEXT: ldrh w21, [sp, #610] +; NONEON-NOSVE-NEXT: ldrh w22, [sp, #608] +; NONEON-NOSVE-NEXT: ldrh w23, [sp, #430] +; NONEON-NOSVE-NEXT: str w8, [sp, #372] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #482] +; NONEON-NOSVE-NEXT: ldrh w24, [sp, #428] +; NONEON-NOSVE-NEXT: ldrh w25, [sp, #426] +; NONEON-NOSVE-NEXT: ldrh w26, [sp, #424] +; NONEON-NOSVE-NEXT: ldrh w27, [sp, #422] +; NONEON-NOSVE-NEXT: str w8, [sp, #368] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #484] +; NONEON-NOSVE-NEXT: ldrh w28, [sp, #420] +; NONEON-NOSVE-NEXT: ldrh w29, [sp, #418] +; NONEON-NOSVE-NEXT: strb w30, [sp, #767] +; NONEON-NOSVE-NEXT: str w8, [sp, #364] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #486] +; NONEON-NOSVE-NEXT: str w8, [sp, #360] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #488] +; NONEON-NOSVE-NEXT: str w8, [sp, #356] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #490] +; NONEON-NOSVE-NEXT: str w8, [sp, #352] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #492] +; NONEON-NOSVE-NEXT: str w8, [sp, #348] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #494] +; NONEON-NOSVE-NEXT: str w8, [sp, #344] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #448] +; NONEON-NOSVE-NEXT: str w8, [sp, #340] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #450] +; NONEON-NOSVE-NEXT: str w8, [sp, #336] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #452] +; NONEON-NOSVE-NEXT: str w8, [sp, #332] // 4-byte Folded Spill 
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #454] +; NONEON-NOSVE-NEXT: str w8, [sp, #328] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #456] +; NONEON-NOSVE-NEXT: str w8, [sp, #324] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #458] +; NONEON-NOSVE-NEXT: str w8, [sp, #320] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #460] +; NONEON-NOSVE-NEXT: str w8, [sp, #316] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #462] +; NONEON-NOSVE-NEXT: str w8, [sp, #312] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #464] +; NONEON-NOSVE-NEXT: str w8, [sp, #308] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #466] +; NONEON-NOSVE-NEXT: str w8, [sp, #304] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #468] +; NONEON-NOSVE-NEXT: str w8, [sp, #300] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #470] +; NONEON-NOSVE-NEXT: str w8, [sp, #296] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #472] +; NONEON-NOSVE-NEXT: str w8, [sp, #292] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #474] +; NONEON-NOSVE-NEXT: str w8, [sp, #288] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #476] +; NONEON-NOSVE-NEXT: str w8, [sp, #284] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #478] +; NONEON-NOSVE-NEXT: str w8, [sp, #280] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #656] +; NONEON-NOSVE-NEXT: str w8, [sp, #276] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #658] +; NONEON-NOSVE-NEXT: str w8, [sp, #272] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #660] +; NONEON-NOSVE-NEXT: str w8, [sp, #268] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #662] +; NONEON-NOSVE-NEXT: str w8, [sp, #264] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #664] +; NONEON-NOSVE-NEXT: str w8, [sp, #260] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #668] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #252] // 
8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #670] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #528] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #244] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #530] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #532] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #236] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #534] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #536] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #228] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #538] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #540] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #220] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #542] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #496] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #212] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #498] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #500] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #204] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #502] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #504] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #196] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #506] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #508] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #188] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #510] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #512] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #180] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #514] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #516] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #172] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #518] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #520] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #164] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #522] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #524] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #156] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #526] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #640] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #148] // 8-byte Folded Spill +; 
NONEON-NOSVE-NEXT: ldrh w9, [sp, #642] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #644] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #140] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #646] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #648] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #132] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #650] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #652] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #124] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #654] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #576] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #116] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #578] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #580] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #108] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #582] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #584] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #100] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #586] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #588] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #92] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #590] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #544] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #84] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #546] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #548] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #76] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #550] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #552] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #68] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #554] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #556] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #558] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #560] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #562] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #564] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, 
[sp, #566] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #568] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #570] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #572] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #574] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #416] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #602] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #604] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strb w8, [sp, #765] +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: strb w8, [sp, #764] +; NONEON-NOSVE-NEXT: add w8, w11, w11 +; NONEON-NOSVE-NEXT: strb w8, [sp, #763] +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: strb w8, [sp, #762] +; NONEON-NOSVE-NEXT: add w8, w13, w13 +; NONEON-NOSVE-NEXT: strb w8, [sp, #761] +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: strb w8, [sp, #760] +; NONEON-NOSVE-NEXT: add w8, w15, w15 +; NONEON-NOSVE-NEXT: strb w8, [sp, #759] +; NONEON-NOSVE-NEXT: add w8, w16, w16 +; NONEON-NOSVE-NEXT: strb w8, [sp, #758] +; NONEON-NOSVE-NEXT: add w8, w17, w17 +; NONEON-NOSVE-NEXT: strb w8, [sp, #757] +; NONEON-NOSVE-NEXT: add w8, w18, w18 +; NONEON-NOSVE-NEXT: strb w8, [sp, #756] +; NONEON-NOSVE-NEXT: add w8, w0, w0 +; NONEON-NOSVE-NEXT: strb w8, [sp, #755] +; NONEON-NOSVE-NEXT: add w8, w1, w1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #754] +; NONEON-NOSVE-NEXT: add w8, w2, w2 +; NONEON-NOSVE-NEXT: strb w8, [sp, #753] +; NONEON-NOSVE-NEXT: add w8, w3, w3 +; NONEON-NOSVE-NEXT: strb w8, [sp, #752] +; NONEON-NOSVE-NEXT: add w8, w4, w4 +; NONEON-NOSVE-NEXT: strb w8, [sp, #751] +; NONEON-NOSVE-NEXT: add w8, w5, w5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #750] +; NONEON-NOSVE-NEXT: add w8, w6, w6 +; NONEON-NOSVE-NEXT: strb w8, [sp, #749] +; NONEON-NOSVE-NEXT: add w8, w7, w7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #748] +; 
NONEON-NOSVE-NEXT: add w8, w19, w19 +; NONEON-NOSVE-NEXT: strb w8, [sp, #747] +; NONEON-NOSVE-NEXT: add w8, w20, w20 +; NONEON-NOSVE-NEXT: strb w8, [sp, #746] +; NONEON-NOSVE-NEXT: add w8, w21, w21 +; NONEON-NOSVE-NEXT: strb w8, [sp, #745] +; NONEON-NOSVE-NEXT: add w8, w22, w22 +; NONEON-NOSVE-NEXT: strb w8, [sp, #744] +; NONEON-NOSVE-NEXT: add w8, w23, w23 +; NONEON-NOSVE-NEXT: strb w8, [sp, #743] +; NONEON-NOSVE-NEXT: add w8, w24, w24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #742] +; NONEON-NOSVE-NEXT: add w8, w25, w25 +; NONEON-NOSVE-NEXT: strb w8, [sp, #741] +; NONEON-NOSVE-NEXT: add w8, w26, w26 +; NONEON-NOSVE-NEXT: strb w8, [sp, #740] +; NONEON-NOSVE-NEXT: add w8, w27, w27 +; NONEON-NOSVE-NEXT: strb w8, [sp, #739] +; NONEON-NOSVE-NEXT: add w8, w28, w28 +; NONEON-NOSVE-NEXT: strb w8, [sp, #738] +; NONEON-NOSVE-NEXT: add w8, w29, w29 +; NONEON-NOSVE-NEXT: strb w8, [sp, #737] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #766] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #736] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #736] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #735] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #734] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #733] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #732] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #36] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #731] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #40] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #730] +; NONEON-NOSVE-NEXT: 
ldr w8, [sp, #44] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #729] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #728] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #727] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #726] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #725] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #68] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #724] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #72] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #723] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #76] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #722] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #80] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #721] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #84] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #720] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #88] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #783] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #92] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #782] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #96] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #781] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #100] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #780] +; 
NONEON-NOSVE-NEXT: ldr w8, [sp, #104] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #779] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #108] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #778] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #112] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #777] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #116] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #776] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #120] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #775] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #124] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #774] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #128] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #773] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #132] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #772] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #136] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #771] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #140] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #770] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #144] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #769] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #148] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #768] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #152] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #719] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #156] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb 
w8, [sp, #718] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #160] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #717] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #164] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #716] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #168] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #715] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #172] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #714] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #176] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #713] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #180] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #712] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #184] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #711] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #188] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #710] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #192] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #709] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #196] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #708] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #200] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #707] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #204] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #706] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #208] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #705] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #212] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; 
NONEON-NOSVE-NEXT: strb w8, [sp, #704] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #216] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q6, q3, [sp, #704] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #799] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #220] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #798] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #224] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #797] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #228] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #796] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #232] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #795] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #236] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #794] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #240] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #793] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #244] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #792] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #248] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #791] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #252] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #790] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #256] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #789] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #260] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #788] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #264] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #787] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #268] // 
4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #786] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #272] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #785] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #276] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #784] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #280] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q4, q7, [sp, #768] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #687] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #284] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #686] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #288] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #685] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #292] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #684] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #296] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #683] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #300] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #682] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #304] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #681] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #308] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #680] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #312] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #679] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #316] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #678] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #320] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: 
strb w8, [sp, #677] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #324] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #676] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #328] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #675] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #332] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #674] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #336] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #673] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #340] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #672] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #344] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #703] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #348] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #702] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #352] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #701] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #356] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #700] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #360] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #699] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #364] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #698] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #368] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #697] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #372] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #696] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #376] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; 
NONEON-NOSVE-NEXT: strb w8, [sp, #695] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #380] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #694] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #384] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #693] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #388] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #692] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #392] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #691] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #396] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #690] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #400] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #689] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #404] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #688] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #408] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q5, q2, [sp, #672] +; NONEON-NOSVE-NEXT: stp q1, q0, [x8] +; NONEON-NOSVE-NEXT: stp q4, q3, [x8, #32] +; NONEON-NOSVE-NEXT: stp q7, q6, [x8, #64] +; NONEON-NOSVE-NEXT: stp q2, q5, [x8, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #800 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #32] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #16] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp], #96 // 16-byte Folded Reload ; NONEON-NOSVE-NEXT: ret %a = load <128 x i16>, ptr %in %b = trunc <128 x i16> %a to <128 x i8> @@ -227,8 +1193,21 @@ define <8 x i8> @trunc_v8i32_v8i8(ptr %in) nounwind { ; NONEON-NOSVE-LABEL: 
trunc_v8i32_v8i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: xtn v0.8b, v0.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: strb w9, [sp, #47] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: strb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: strb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %a = load <8 x i32>, ptr %in %b = trunc <8 x i32> %a to <8 x i8> @@ -256,11 +1235,38 @@ define <16 x i8> @trunc_v16i32_v16i8(ptr %in) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v16i32_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: uzp1 v2.8h, v3.8h, v2.8h -; NONEON-NOSVE-NEXT: uzp1 v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: str q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: str q2, [sp] +; NONEON-NOSVE-NEXT: strb w9, [sp, #79] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: strb w9, [sp, #77] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: strb w9, [sp, #75] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: strb w9, [sp, #73] +; NONEON-NOSVE-NEXT: strb w8, [sp, 
#72] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w9, [sp, #71] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w9, [sp, #69] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w9, [sp, #67] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: strb w9, [sp, #65] +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %a = load <16 x i32>, ptr %in %b = trunc <16 x i32> %a to <16 x i8> @@ -302,19 +1308,113 @@ define void @trunc_v32i32_v32i8(ptr %in, ptr %out) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v32i32_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #64] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #96] +; NONEON-NOSVE-NEXT: sub sp, sp, #272 +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #192] // 16-byte Folded Spill ; NONEON-NOSVE-NEXT: ldp q5, q4, [x0] -; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #32] -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: uzp1 v2.8h, v3.8h, v2.8h -; NONEON-NOSVE-NEXT: uzp1 v3.8h, v5.8h, v4.8h -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v7.8h, v6.8h -; NONEON-NOSVE-NEXT: uzp1 v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: uzp1 v1.16b, v3.16b, v1.16b -; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b -; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #208] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #224] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #64] +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #240] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #96] +; NONEON-NOSVE-NEXT: stp q2, q4, [sp, #80] +; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #112] +; NONEON-NOSVE-NEXT: stp q5, q7, [sp, #16] +; NONEON-NOSVE-NEXT: ldp 
w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: ldp w27, w28, [sp, #112] +; NONEON-NOSVE-NEXT: ldp w25, w26, [sp, #104] +; NONEON-NOSVE-NEXT: add w6, w8, w8 +; NONEON-NOSVE-NEXT: add w5, w9, w9 +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #256] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w10, w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldp w23, w24, [sp, #96] +; NONEON-NOSVE-NEXT: ldp w21, w22, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #8] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #120] +; NONEON-NOSVE-NEXT: stp q6, q0, [sp, #48] +; NONEON-NOSVE-NEXT: ldp w19, w20, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #176] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: strb w8, [sp, #155] +; NONEON-NOSVE-NEXT: add w8, w28, w28 +; NONEON-NOSVE-NEXT: strb w9, [sp, #154] +; NONEON-NOSVE-NEXT: add w9, w27, w27 +; NONEON-NOSVE-NEXT: strb w8, [sp, #153] +; NONEON-NOSVE-NEXT: add w8, w26, w26 +; NONEON-NOSVE-NEXT: strb w9, [sp, #152] +; NONEON-NOSVE-NEXT: add w9, w25, w25 +; NONEON-NOSVE-NEXT: ldp w4, w7, [sp, #56] +; NONEON-NOSVE-NEXT: strb w8, [sp, #151] +; NONEON-NOSVE-NEXT: add w8, w24, w24 +; NONEON-NOSVE-NEXT: strb w9, [sp, #150] +; NONEON-NOSVE-NEXT: add w9, w23, w23 +; NONEON-NOSVE-NEXT: ldp w2, w3, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #149] +; NONEON-NOSVE-NEXT: add w8, w22, w22 +; NONEON-NOSVE-NEXT: strb w9, [sp, #148] +; NONEON-NOSVE-NEXT: add w9, w21, w21 +; NONEON-NOSVE-NEXT: ldp w18, w0, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #147] +; NONEON-NOSVE-NEXT: add w8, w20, w20 +; NONEON-NOSVE-NEXT: strb w9, [sp, #146] +; NONEON-NOSVE-NEXT: add w9, w19, w19 +; NONEON-NOSVE-NEXT: ldp w16, w17, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #145] +; NONEON-NOSVE-NEXT: add w8, w7, w7 +; NONEON-NOSVE-NEXT: strb w9, [sp, #144] +; NONEON-NOSVE-NEXT: add w9, w4, w4 +; NONEON-NOSVE-NEXT: ldp w14, w15, [sp, #72] +; NONEON-NOSVE-NEXT: strb w8, [sp, #175] +; NONEON-NOSVE-NEXT: 
add w8, w3, w3 +; NONEON-NOSVE-NEXT: strb w9, [sp, #174] +; NONEON-NOSVE-NEXT: add w9, w2, w2 +; NONEON-NOSVE-NEXT: ldp w12, w13, [sp, #64] +; NONEON-NOSVE-NEXT: strb w8, [sp, #173] +; NONEON-NOSVE-NEXT: add w8, w0, w0 +; NONEON-NOSVE-NEXT: strb w9, [sp, #172] +; NONEON-NOSVE-NEXT: add w9, w18, w18 +; NONEON-NOSVE-NEXT: ldp w10, w11, [sp, #136] +; NONEON-NOSVE-NEXT: strb w8, [sp, #171] +; NONEON-NOSVE-NEXT: add w8, w17, w17 +; NONEON-NOSVE-NEXT: strb w9, [sp, #170] +; NONEON-NOSVE-NEXT: add w9, w16, w16 +; NONEON-NOSVE-NEXT: strb w8, [sp, #169] +; NONEON-NOSVE-NEXT: add w8, w15, w15 +; NONEON-NOSVE-NEXT: strb w9, [sp, #168] +; NONEON-NOSVE-NEXT: add w9, w14, w14 +; NONEON-NOSVE-NEXT: strb w8, [sp, #167] +; NONEON-NOSVE-NEXT: add w8, w13, w13 +; NONEON-NOSVE-NEXT: strb w9, [sp, #166] +; NONEON-NOSVE-NEXT: add w9, w12, w12 +; NONEON-NOSVE-NEXT: ldp w29, w30, [sp, #80] +; NONEON-NOSVE-NEXT: strb w8, [sp, #165] +; NONEON-NOSVE-NEXT: add w8, w11, w11 +; NONEON-NOSVE-NEXT: strb w9, [sp, #164] +; NONEON-NOSVE-NEXT: add w9, w10, w10 +; NONEON-NOSVE-NEXT: strb w8, [sp, #163] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #162] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #12] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w5, [sp, #159] +; NONEON-NOSVE-NEXT: add w5, w30, w30 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w6, [sp, #158] +; NONEON-NOSVE-NEXT: add w6, w29, w29 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strb w5, [sp, #157] +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #256] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w6, [sp, #156] +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #240] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #161] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #224] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #160] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #208] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #144] +; 
NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #192] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #176] // 16-byte Folded Reload ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #272 ; NONEON-NOSVE-NEXT: ret %a = load <32 x i32>, ptr %in %b = trunc <32 x i32> %a to <32 x i8> @@ -383,32 +1483,273 @@ define void @trunc_v64i32_v64i8(ptr %in, ptr %out) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v64i32_v64i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #128] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #160] -; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #192] -; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #224] -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: uzp1 v2.8h, v3.8h, v2.8h -; NONEON-NOSVE-NEXT: ldp q3, q1, [x0] -; NONEON-NOSVE-NEXT: uzp1 v4.8h, v5.8h, v4.8h -; NONEON-NOSVE-NEXT: ldp q17, q5, [x0, #64] -; NONEON-NOSVE-NEXT: uzp1 v6.8h, v7.8h, v6.8h -; NONEON-NOSVE-NEXT: ldp q16, q7, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q19, q18, [x0, #96] -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v3.8h, v1.8h -; NONEON-NOSVE-NEXT: uzp1 v5.8h, v17.8h, v5.8h -; NONEON-NOSVE-NEXT: uzp1 v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: uzp1 v7.8h, v16.8h, v7.8h -; NONEON-NOSVE-NEXT: uzp1 v3.8h, v19.8h, v18.8h -; NONEON-NOSVE-NEXT: uzp1 v2.16b, v4.16b, v6.16b -; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b -; NONEON-NOSVE-NEXT: uzp1 v1.16b, v1.16b, v7.16b -; NONEON-NOSVE-NEXT: uzp1 v3.16b, v5.16b, v3.16b -; NONEON-NOSVE-NEXT: add v2.16b, v2.16b, v2.16b -; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b -; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add v3.16b, v3.16b, v3.16b -; NONEON-NOSVE-NEXT: stp q1, q3, [x1] +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #-96]! 
// 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #16] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #32] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: sub sp, sp, #480 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #96] +; NONEON-NOSVE-NEXT: str x1, [sp, #152] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #64] +; NONEON-NOSVE-NEXT: ldp q17, q16, [x0, #128] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0] +; NONEON-NOSVE-NEXT: ldp q19, q18, [x0, #224] +; NONEON-NOSVE-NEXT: ldp q21, q20, [x0, #192] +; NONEON-NOSVE-NEXT: ldp q23, q22, [x0, #160] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #288] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #316] +; NONEON-NOSVE-NEXT: str q18, [sp, #208] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #304] +; NONEON-NOSVE-NEXT: stp q21, q19, [sp, #176] +; NONEON-NOSVE-NEXT: ldr w11, [sp, #296] +; NONEON-NOSVE-NEXT: ldr w12, [sp, #292] +; NONEON-NOSVE-NEXT: add w20, w8, w8 +; NONEON-NOSVE-NEXT: stp q20, q23, [sp, #224] +; NONEON-NOSVE-NEXT: ldr w13, [sp, #288] +; NONEON-NOSVE-NEXT: stp q22, q16, [sp, #256] +; NONEON-NOSVE-NEXT: ldr w22, [sp, #312] +; NONEON-NOSVE-NEXT: stp q3, q17, [sp, #384] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #400] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #404] +; NONEON-NOSVE-NEXT: str q7, [sp, #160] +; NONEON-NOSVE-NEXT: stp q2, q4, [sp, #320] +; NONEON-NOSVE-NEXT: ldr w18, [sp, #396] +; NONEON-NOSVE-NEXT: ldr w0, [sp, #392] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #144] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w9, [sp, #408] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #412] +; NONEON-NOSVE-NEXT: ldr w14, [sp, #332] +; NONEON-NOSVE-NEXT: ldr w15, [sp, #328] +; NONEON-NOSVE-NEXT: ldr w16, [sp, #324] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #136] // 8-byte Folded Spill +; 
NONEON-NOSVE-NEXT: ldr w9, [sp, #272] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #276] +; NONEON-NOSVE-NEXT: ldr w17, [sp, #320] +; NONEON-NOSVE-NEXT: ldr w1, [sp, #388] +; NONEON-NOSVE-NEXT: ldr w2, [sp, #384] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w9, [sp, #280] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #284] +; NONEON-NOSVE-NEXT: ldr w3, [sp, #348] +; NONEON-NOSVE-NEXT: ldr w4, [sp, #344] +; NONEON-NOSVE-NEXT: ldr w5, [sp, #340] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w6, [sp, #336] +; NONEON-NOSVE-NEXT: stp q6, q5, [sp, #352] +; NONEON-NOSVE-NEXT: ldr w7, [sp, #380] +; NONEON-NOSVE-NEXT: ldr w19, [sp, #376] +; NONEON-NOSVE-NEXT: ldr w21, [sp, #372] +; NONEON-NOSVE-NEXT: ldr w23, [sp, #368] +; NONEON-NOSVE-NEXT: ldr w24, [sp, #364] +; NONEON-NOSVE-NEXT: ldr w25, [sp, #360] +; NONEON-NOSVE-NEXT: ldr w26, [sp, #356] +; NONEON-NOSVE-NEXT: ldr w27, [sp, #352] +; NONEON-NOSVE-NEXT: strb w20, [sp, #463] +; NONEON-NOSVE-NEXT: add w20, w22, w22 +; NONEON-NOSVE-NEXT: strb w20, [sp, #462] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #240] +; NONEON-NOSVE-NEXT: ldp w29, w28, [sp, #168] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #248] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w9, [sp, #256] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #260] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #96] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w9, [sp, #264] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #268] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #176] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #184] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #224] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] // 8-byte Folded Spill 
+; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #232] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #192] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #200] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #208] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #216] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w9, [sp, #300] +; NONEON-NOSVE-NEXT: ldp w8, w30, [sp, #160] +; NONEON-NOSVE-NEXT: str w8, [sp, #20] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w8, [sp, #308] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #461] +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: strb w8, [sp, #460] +; NONEON-NOSVE-NEXT: add w8, w9, w9 +; NONEON-NOSVE-NEXT: strb w8, [sp, #459] +; NONEON-NOSVE-NEXT: add w8, w11, w11 +; NONEON-NOSVE-NEXT: strb w8, [sp, #458] +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: strb w8, [sp, #457] +; NONEON-NOSVE-NEXT: add w8, w13, w13 +; NONEON-NOSVE-NEXT: strb w8, [sp, #456] +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: strb w8, [sp, #455] +; NONEON-NOSVE-NEXT: add w8, w15, w15 +; NONEON-NOSVE-NEXT: strb w8, [sp, #454] +; NONEON-NOSVE-NEXT: add w8, w16, w16 +; NONEON-NOSVE-NEXT: strb w8, [sp, #453] +; NONEON-NOSVE-NEXT: add w8, w17, w17 +; NONEON-NOSVE-NEXT: strb w8, [sp, #452] +; NONEON-NOSVE-NEXT: add w8, w18, w18 +; NONEON-NOSVE-NEXT: strb w8, [sp, #451] +; NONEON-NOSVE-NEXT: add w8, w0, w0 +; NONEON-NOSVE-NEXT: strb w8, [sp, #450] +; NONEON-NOSVE-NEXT: add w8, w1, w1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #449] +; NONEON-NOSVE-NEXT: add w8, w2, w2 +; NONEON-NOSVE-NEXT: strb w8, [sp, #448] +; NONEON-NOSVE-NEXT: add w8, w3, w3 +; NONEON-NOSVE-NEXT: strb w8, [sp, #447] +; NONEON-NOSVE-NEXT: add w8, w4, w4 +; 
NONEON-NOSVE-NEXT: strb w8, [sp, #446] +; NONEON-NOSVE-NEXT: add w8, w5, w5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #445] +; NONEON-NOSVE-NEXT: add w8, w6, w6 +; NONEON-NOSVE-NEXT: strb w8, [sp, #444] +; NONEON-NOSVE-NEXT: add w8, w7, w7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #443] +; NONEON-NOSVE-NEXT: add w8, w19, w19 +; NONEON-NOSVE-NEXT: strb w8, [sp, #442] +; NONEON-NOSVE-NEXT: add w8, w21, w21 +; NONEON-NOSVE-NEXT: strb w8, [sp, #441] +; NONEON-NOSVE-NEXT: add w8, w23, w23 +; NONEON-NOSVE-NEXT: strb w8, [sp, #440] +; NONEON-NOSVE-NEXT: add w8, w24, w24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #439] +; NONEON-NOSVE-NEXT: add w8, w25, w25 +; NONEON-NOSVE-NEXT: strb w8, [sp, #438] +; NONEON-NOSVE-NEXT: add w8, w26, w26 +; NONEON-NOSVE-NEXT: strb w8, [sp, #437] +; NONEON-NOSVE-NEXT: add w8, w27, w27 +; NONEON-NOSVE-NEXT: strb w8, [sp, #436] +; NONEON-NOSVE-NEXT: add w8, w28, w28 +; NONEON-NOSVE-NEXT: strb w8, [sp, #435] +; NONEON-NOSVE-NEXT: add w8, w29, w29 +; NONEON-NOSVE-NEXT: strb w8, [sp, #434] +; NONEON-NOSVE-NEXT: add w8, w30, w30 +; NONEON-NOSVE-NEXT: strb w8, [sp, #433] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #432] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #431] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #430] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #429] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #36] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #428] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #40] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #427] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #44] // 4-byte 
Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #426] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #425] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #424] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #423] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #422] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #64] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #421] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #68] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #420] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #72] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #419] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #76] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #418] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #80] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #417] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #84] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #416] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #88] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q1, q3, [sp, #416] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #479] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #92] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #478] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #96] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #477] +; 
NONEON-NOSVE-NEXT: ldr w8, [sp, #100] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #476] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #104] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #475] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #108] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #474] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #112] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #473] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #116] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #472] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #120] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #471] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #124] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #470] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #128] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #469] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #132] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #468] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #136] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #467] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #140] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #466] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #144] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #465] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #148] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #464] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #152] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q2, q0, [sp, #448] +; 
NONEON-NOSVE-NEXT: stp q3, q2, [x8] +; NONEON-NOSVE-NEXT: stp q0, q1, [x8, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #480 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #32] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #16] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp], #96 // 16-byte Folded Reload ; NONEON-NOSVE-NEXT: ret %a = load <64 x i32>, ptr %in %b = trunc <64 x i32> %a to <64 x i8> @@ -435,7 +1776,21 @@ define <8 x i16> @trunc_v8i32_v8i16(ptr %in) nounwind { ; NONEON-NOSVE-LABEL: trunc_v8i32_v8i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: strh w9, [sp, #46] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: strh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: strh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %a = load <8 x i32>, ptr %in %b = trunc <8 x i32> %a to <8 x i16> @@ -462,13 +1817,54 @@ define void @trunc_v16i32_v16i16(ptr %in, ptr %out) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v16i32_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] ; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v3.8h, v2.8h -; 
NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h -; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q3, q1, [sp] +; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldp w2, w3, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w4, w5, [sp, #8] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: ldp w18, w0, [sp] +; NONEON-NOSVE-NEXT: ldp w16, w17, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: add w8, w3, w3 +; NONEON-NOSVE-NEXT: strh w9, [sp, #76] +; NONEON-NOSVE-NEXT: add w9, w2, w2 +; NONEON-NOSVE-NEXT: ldp w14, w15, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: add w8, w5, w5 +; NONEON-NOSVE-NEXT: strh w9, [sp, #72] +; NONEON-NOSVE-NEXT: add w9, w4, w4 +; NONEON-NOSVE-NEXT: ldp w12, w13, [sp, #56] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: add w8, w0, w0 +; NONEON-NOSVE-NEXT: strh w9, [sp, #68] +; NONEON-NOSVE-NEXT: add w9, w18, w18 +; NONEON-NOSVE-NEXT: ldp w10, w11, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: add w8, w17, w17 +; NONEON-NOSVE-NEXT: strh w9, [sp, #64] +; NONEON-NOSVE-NEXT: add w9, w16, w16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: add w8, w15, w15 +; NONEON-NOSVE-NEXT: strh w9, [sp, #92] +; NONEON-NOSVE-NEXT: add w9, w14, w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: add w8, w13, w13 +; NONEON-NOSVE-NEXT: strh w9, [sp, #88] +; NONEON-NOSVE-NEXT: add w9, w12, w12 +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: add w8, w11, w11 +; NONEON-NOSVE-NEXT: strh w9, [sp, #84] +; NONEON-NOSVE-NEXT: add w9, w10, w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: strh w9, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %a = load <16 x i32>, ptr %in %b = trunc <16 x i32> %a to <16 
x i16> @@ -508,20 +1904,115 @@ define void @trunc_v32i32_v32i16(ptr %in, ptr %out) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v32i32_v32i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #64] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #96] +; NONEON-NOSVE-NEXT: sub sp, sp, #304 +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #224] // 16-byte Folded Spill ; NONEON-NOSVE-NEXT: ldp q5, q4, [x0] -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: ldp q6, q1, [x0, #32] -; NONEON-NOSVE-NEXT: uzp1 v2.8h, v3.8h, v2.8h -; NONEON-NOSVE-NEXT: uzp1 v3.8h, v5.8h, v4.8h -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v6.8h, v1.8h -; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h -; NONEON-NOSVE-NEXT: add v2.8h, v2.8h, v2.8h -; NONEON-NOSVE-NEXT: add v3.8h, v3.8h, v3.8h -; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h -; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] -; NONEON-NOSVE-NEXT: stp q3, q1, [x1] +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #240] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #256] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #64] +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #272] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #96] +; NONEON-NOSVE-NEXT: stp q2, q4, [sp, #80] +; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #112] +; NONEON-NOSVE-NEXT: stp q5, q7, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: ldp w27, w28, [sp, #112] +; NONEON-NOSVE-NEXT: ldp w25, w26, [sp, #104] +; NONEON-NOSVE-NEXT: add w6, w8, w8 +; NONEON-NOSVE-NEXT: add w5, w9, w9 +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #288] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w10, w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldp w23, w24, [sp, #96] +; NONEON-NOSVE-NEXT: ldp w21, w22, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #8] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #120] +; NONEON-NOSVE-NEXT: stp q6, q0, [sp, #48] +; NONEON-NOSVE-NEXT: ldp w19, w20, [sp, 
#16] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #208] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: strh w8, [sp, #182] +; NONEON-NOSVE-NEXT: add w8, w28, w28 +; NONEON-NOSVE-NEXT: strh w9, [sp, #180] +; NONEON-NOSVE-NEXT: add w9, w27, w27 +; NONEON-NOSVE-NEXT: strh w8, [sp, #178] +; NONEON-NOSVE-NEXT: add w8, w26, w26 +; NONEON-NOSVE-NEXT: strh w9, [sp, #176] +; NONEON-NOSVE-NEXT: add w9, w25, w25 +; NONEON-NOSVE-NEXT: ldp w4, w7, [sp, #56] +; NONEON-NOSVE-NEXT: strh w8, [sp, #174] +; NONEON-NOSVE-NEXT: add w8, w24, w24 +; NONEON-NOSVE-NEXT: strh w9, [sp, #172] +; NONEON-NOSVE-NEXT: add w9, w23, w23 +; NONEON-NOSVE-NEXT: ldp w2, w3, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #170] +; NONEON-NOSVE-NEXT: add w8, w22, w22 +; NONEON-NOSVE-NEXT: strh w9, [sp, #168] +; NONEON-NOSVE-NEXT: add w9, w21, w21 +; NONEON-NOSVE-NEXT: ldp w18, w0, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #166] +; NONEON-NOSVE-NEXT: add w8, w20, w20 +; NONEON-NOSVE-NEXT: strh w9, [sp, #164] +; NONEON-NOSVE-NEXT: add w9, w19, w19 +; NONEON-NOSVE-NEXT: ldp w16, w17, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #162] +; NONEON-NOSVE-NEXT: add w8, w7, w7 +; NONEON-NOSVE-NEXT: strh w9, [sp, #160] +; NONEON-NOSVE-NEXT: add w9, w4, w4 +; NONEON-NOSVE-NEXT: ldp w14, w15, [sp, #72] +; NONEON-NOSVE-NEXT: strh w8, [sp, #158] +; NONEON-NOSVE-NEXT: add w8, w3, w3 +; NONEON-NOSVE-NEXT: strh w9, [sp, #156] +; NONEON-NOSVE-NEXT: add w9, w2, w2 +; NONEON-NOSVE-NEXT: ldp w12, w13, [sp, #64] +; NONEON-NOSVE-NEXT: strh w8, [sp, #154] +; NONEON-NOSVE-NEXT: add w8, w0, w0 +; NONEON-NOSVE-NEXT: strh w9, [sp, #152] +; NONEON-NOSVE-NEXT: add w9, w18, w18 +; NONEON-NOSVE-NEXT: ldp w10, w11, [sp, #136] +; NONEON-NOSVE-NEXT: strh w8, [sp, #150] +; NONEON-NOSVE-NEXT: add w8, w17, w17 +; NONEON-NOSVE-NEXT: strh w9, [sp, #148] +; NONEON-NOSVE-NEXT: add w9, w16, w16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #146] +; NONEON-NOSVE-NEXT: add w8, w15, w15 +; 
NONEON-NOSVE-NEXT: strh w9, [sp, #144] +; NONEON-NOSVE-NEXT: add w9, w14, w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #206] +; NONEON-NOSVE-NEXT: add w8, w13, w13 +; NONEON-NOSVE-NEXT: strh w9, [sp, #204] +; NONEON-NOSVE-NEXT: add w9, w12, w12 +; NONEON-NOSVE-NEXT: ldp w29, w30, [sp, #80] +; NONEON-NOSVE-NEXT: strh w8, [sp, #202] +; NONEON-NOSVE-NEXT: add w8, w11, w11 +; NONEON-NOSVE-NEXT: strh w9, [sp, #200] +; NONEON-NOSVE-NEXT: add w9, w10, w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #198] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w9, [sp, #196] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #12] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w5, [sp, #190] +; NONEON-NOSVE-NEXT: add w5, w30, w30 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w6, [sp, #188] +; NONEON-NOSVE-NEXT: add w6, w29, w29 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strh w5, [sp, #186] +; NONEON-NOSVE-NEXT: ldp q1, q3, [sp, #144] +; NONEON-NOSVE-NEXT: strh w6, [sp, #184] +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #288] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #194] +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #272] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w9, [sp, #192] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #256] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q2, q0, [sp, #176] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #240] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #224] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #208] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: stp q3, q2, [x1] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #304 ; NONEON-NOSVE-NEXT: ret %a = load <32 x i32>, ptr %in %b = trunc <32 x i32> %a to <32 x i16> @@ -583,34 +2074,276 @@ define void @trunc_v64i32_v64i16(ptr %in, ptr %out) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v64i32_v64i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, 
#192] -; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #224] -; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #128] -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: ldp q16, q1, [x0, #160] -; NONEON-NOSVE-NEXT: uzp1 v4.8h, v5.8h, v4.8h -; NONEON-NOSVE-NEXT: ldp q17, q5, [x0, #64] -; NONEON-NOSVE-NEXT: uzp1 v6.8h, v7.8h, v6.8h +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #-96]! // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #16] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #32] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: sub sp, sp, #528 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: mov x5, x1 +; NONEON-NOSVE-NEXT: ldp q17, q16, [x0, #192] +; NONEON-NOSVE-NEXT: ldp q23, q22, [x0, #224] ; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] -; NONEON-NOSVE-NEXT: ldp q18, q7, [x0, #96] -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v16.8h, v1.8h -; NONEON-NOSVE-NEXT: uzp1 v5.8h, v17.8h, v5.8h -; NONEON-NOSVE-NEXT: ldp q17, q16, [x0, #32] -; NONEON-NOSVE-NEXT: uzp1 v2.8h, v3.8h, v2.8h -; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h -; NONEON-NOSVE-NEXT: add v4.8h, v4.8h, v4.8h -; NONEON-NOSVE-NEXT: uzp1 v7.8h, v18.8h, v7.8h -; NONEON-NOSVE-NEXT: add v3.8h, v6.8h, v6.8h -; NONEON-NOSVE-NEXT: uzp1 v6.8h, v17.8h, v16.8h -; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h -; NONEON-NOSVE-NEXT: stp q0, q4, [x1, #96] -; NONEON-NOSVE-NEXT: add v0.8h, v5.8h, v5.8h -; NONEON-NOSVE-NEXT: add v2.8h, v2.8h, v2.8h -; NONEON-NOSVE-NEXT: add v4.8h, v7.8h, v7.8h -; NONEON-NOSVE-NEXT: stp q3, q1, [x1, #64] -; NONEON-NOSVE-NEXT: add v1.8h, v6.8h, v6.8h -; NONEON-NOSVE-NEXT: stp q0, q4, [x1, #32] -; NONEON-NOSVE-NEXT: stp q2, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #96] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #64] +; NONEON-NOSVE-NEXT: ldp q19, q18, [x0, 
#160] +; NONEON-NOSVE-NEXT: ldp q21, q20, [x0, #128] +; NONEON-NOSVE-NEXT: str q0, [sp, #320] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #332] +; NONEON-NOSVE-NEXT: stp q17, q23, [sp, #160] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #320] +; NONEON-NOSVE-NEXT: stp q22, q16, [sp, #192] +; NONEON-NOSVE-NEXT: ldr w23, [sp, #328] +; NONEON-NOSVE-NEXT: add w21, w8, w8 +; NONEON-NOSVE-NEXT: stp q18, q20, [sp, #240] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #160] +; NONEON-NOSVE-NEXT: stp q7, q21, [sp, #368] +; NONEON-NOSVE-NEXT: str q19, [sp, #224] +; NONEON-NOSVE-NEXT: ldr w29, [sp, #380] +; NONEON-NOSVE-NEXT: ldr w30, [sp, #376] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #136] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #168] +; NONEON-NOSVE-NEXT: stp q4, q6, [sp, #288] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #336] +; NONEON-NOSVE-NEXT: ldr w3, [sp, #300] +; NONEON-NOSVE-NEXT: ldr w4, [sp, #296] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w11, [sp, #360] +; NONEON-NOSVE-NEXT: ldr w12, [sp, #356] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #208] +; NONEON-NOSVE-NEXT: ldr w13, [sp, #352] +; NONEON-NOSVE-NEXT: ldr w14, [sp, #348] +; NONEON-NOSVE-NEXT: ldr w15, [sp, #344] +; NONEON-NOSVE-NEXT: str q3, [sp, #144] +; NONEON-NOSVE-NEXT: ldr w16, [sp, #340] +; NONEON-NOSVE-NEXT: ldr w17, [sp, #336] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w6, [sp, #292] +; NONEON-NOSVE-NEXT: ldr w7, [sp, #288] +; NONEON-NOSVE-NEXT: str q5, [sp, #272] +; NONEON-NOSVE-NEXT: ldr w25, [sp, #316] +; NONEON-NOSVE-NEXT: ldr w26, [sp, #312] +; NONEON-NOSVE-NEXT: ldr w19, [sp, #284] +; NONEON-NOSVE-NEXT: ldr w20, [sp, #280] +; NONEON-NOSVE-NEXT: ldr w22, [sp, #276] +; NONEON-NOSVE-NEXT: ldr w24, [sp, #272] +; NONEON-NOSVE-NEXT: ldr w27, [sp, #308] +; NONEON-NOSVE-NEXT: ldr w28, [sp, #304] +; NONEON-NOSVE-NEXT: strh w21, [sp, #494] +; NONEON-NOSVE-NEXT: add w21, w23, w23 +; NONEON-NOSVE-NEXT: strh w21, [sp, 
#492] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #216] +; NONEON-NOSVE-NEXT: ldp w0, w18, [sp, #152] +; NONEON-NOSVE-NEXT: ldp w2, w1, [sp, #144] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #176] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #184] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #96] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #192] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #200] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w9, [sp, #384] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #388] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w9, [sp, #392] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #396] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w9, [sp, #256] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #260] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w9, [sp, #264] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #268] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #224] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #232] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #240] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #248] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w9, [sp, #368] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #372] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w8, [sp, #324] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #364] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh 
w8, [sp, #490] +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #488] +; NONEON-NOSVE-NEXT: add w8, w9, w9 +; NONEON-NOSVE-NEXT: strh w8, [sp, #486] +; NONEON-NOSVE-NEXT: add w8, w11, w11 +; NONEON-NOSVE-NEXT: strh w8, [sp, #484] +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: strh w8, [sp, #482] +; NONEON-NOSVE-NEXT: add w8, w13, w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #480] +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #478] +; NONEON-NOSVE-NEXT: add w8, w15, w15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #476] +; NONEON-NOSVE-NEXT: add w8, w16, w16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #474] +; NONEON-NOSVE-NEXT: add w8, w17, w17 +; NONEON-NOSVE-NEXT: strh w8, [sp, #472] +; NONEON-NOSVE-NEXT: add w8, w18, w18 +; NONEON-NOSVE-NEXT: strh w8, [sp, #470] +; NONEON-NOSVE-NEXT: add w8, w0, w0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #468] +; NONEON-NOSVE-NEXT: add w8, w1, w1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #466] +; NONEON-NOSVE-NEXT: add w8, w2, w2 +; NONEON-NOSVE-NEXT: strh w8, [sp, #464] +; NONEON-NOSVE-NEXT: add w8, w3, w3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #462] +; NONEON-NOSVE-NEXT: add w8, w4, w4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #460] +; NONEON-NOSVE-NEXT: add w8, w6, w6 +; NONEON-NOSVE-NEXT: strh w8, [sp, #458] +; NONEON-NOSVE-NEXT: add w8, w7, w7 +; NONEON-NOSVE-NEXT: strh w8, [sp, #456] +; NONEON-NOSVE-NEXT: add w8, w19, w19 +; NONEON-NOSVE-NEXT: strh w8, [sp, #454] +; NONEON-NOSVE-NEXT: add w8, w20, w20 +; NONEON-NOSVE-NEXT: strh w8, [sp, #452] +; NONEON-NOSVE-NEXT: add w8, w22, w22 +; NONEON-NOSVE-NEXT: strh w8, [sp, #450] +; NONEON-NOSVE-NEXT: add w8, w24, w24 +; NONEON-NOSVE-NEXT: strh w8, [sp, #448] +; NONEON-NOSVE-NEXT: add w8, w25, w25 +; NONEON-NOSVE-NEXT: strh w8, [sp, #510] +; NONEON-NOSVE-NEXT: add w8, w26, w26 +; NONEON-NOSVE-NEXT: strh w8, [sp, #508] +; NONEON-NOSVE-NEXT: add w8, w27, w27 +; NONEON-NOSVE-NEXT: strh w8, [sp, #506] +; NONEON-NOSVE-NEXT: add w8, w28, w28 +; 
NONEON-NOSVE-NEXT: strh w8, [sp, #504] +; NONEON-NOSVE-NEXT: add w8, w29, w29 +; NONEON-NOSVE-NEXT: strh w8, [sp, #502] +; NONEON-NOSVE-NEXT: add w8, w30, w30 +; NONEON-NOSVE-NEXT: strh w8, [sp, #500] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #464] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #498] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #496] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #446] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #444] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #442] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #440] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #438] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #36] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #436] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #40] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #434] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #44] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #432] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q6, q3, [sp, #432] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #526] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, 
#524] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #522] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #520] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #64] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #518] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #68] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #516] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #72] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #514] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #76] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #512] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #80] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q4, q7, [sp, #496] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #414] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #84] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #412] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #88] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #410] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #92] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #408] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #96] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #406] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #100] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #404] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #104] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #402] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #108] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: 
add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #400] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #112] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #430] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #116] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #428] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #120] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #426] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #124] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #424] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #128] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #422] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #132] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #420] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #136] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #418] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #140] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #416] +; NONEON-NOSVE-NEXT: ldp q5, q2, [sp, #400] +; NONEON-NOSVE-NEXT: stp q1, q0, [x5] +; NONEON-NOSVE-NEXT: stp q4, q3, [x5, #32] +; NONEON-NOSVE-NEXT: stp q7, q6, [x5, #64] +; NONEON-NOSVE-NEXT: stp q2, q5, [x5, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #528 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #32] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #16] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp], #96 // 16-byte Folded Reload ; NONEON-NOSVE-NEXT: ret %a = load <64 x i32>, ptr %in %b = trunc <64 x i32> %a to <64 x i16> @@ -639,8 
+2372,15 @@ define <4 x i8> @trunc_v4i64_v4i8(ptr %in) nounwind { ; NONEON-NOSVE-LABEL: trunc_v4i64_v4i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldp x8, x10, [sp] +; NONEON-NOSVE-NEXT: strh w9, [sp, #46] +; NONEON-NOSVE-NEXT: strh w10, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %a = load <4 x i64>, ptr %in %b = trunc <4 x i64> %a to <4 x i8> @@ -669,12 +2409,27 @@ define <8 x i8> @trunc_v8i64_v8i8(ptr %in) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v8i64_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; NONEON-NOSVE-NEXT: xtn v0.8b, v0.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: str q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: str q2, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldp x8, x10, [sp, #48] +; NONEON-NOSVE-NEXT: strb w9, [sp, #79] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldp x8, x11, [sp, #32] +; NONEON-NOSVE-NEXT: strb w10, [sp, #77] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: strb w11, [sp, #75] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #72] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; 
NONEON-NOSVE-NEXT: ret %a = load <8 x i64>, ptr %in %b = trunc <8 x i64> %a to <8 x i8> @@ -717,17 +2472,47 @@ define <16 x i8> @trunc_v16i64_v16i8(ptr %in) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v16i64_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #96] -; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #64] -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: uzp1 v4.4s, v5.4s, v4.4s -; NONEON-NOSVE-NEXT: uzp1 v3.4s, v7.4s, v6.4s -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v4.8h -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v3.8h, v2.8h -; NONEON-NOSVE-NEXT: uzp1 v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #144 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #96] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ldp q4, q5, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q6, q7, [x0, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: str q3, [sp, #80] +; NONEON-NOSVE-NEXT: str q2, [sp] +; NONEON-NOSVE-NEXT: stp q7, q5, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #142] +; NONEON-NOSVE-NEXT: ldp x8, x10, [sp, #16] +; NONEON-NOSVE-NEXT: stp q4, q6, [sp, #96] +; NONEON-NOSVE-NEXT: strb w9, [sp, #143] +; NONEON-NOSVE-NEXT: strb w8, [sp, #140] +; NONEON-NOSVE-NEXT: ldp x8, x11, [sp, #48] +; NONEON-NOSVE-NEXT: strb w10, [sp, #141] +; NONEON-NOSVE-NEXT: strb w8, [sp, #138] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #120] +; NONEON-NOSVE-NEXT: strb w11, [sp, #139] +; NONEON-NOSVE-NEXT: strb w8, [sp, #137] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #112] +; NONEON-NOSVE-NEXT: strb w8, [sp, #136] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #72] +; NONEON-NOSVE-NEXT: strb w8, [sp, #135] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #64] +; NONEON-NOSVE-NEXT: strb w8, [sp, #134] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #104] +; NONEON-NOSVE-NEXT: strb w8, [sp, #133] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #96] 
+; NONEON-NOSVE-NEXT: strb w8, [sp, #132] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #88] +; NONEON-NOSVE-NEXT: strb w8, [sp, #131] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #80] +; NONEON-NOSVE-NEXT: strb w8, [sp, #130] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #129] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #128] +; NONEON-NOSVE-NEXT: add sp, sp, #144 ; NONEON-NOSVE-NEXT: ret %a = load <16 x i64>, ptr %in %b = trunc <16 x i64> %a to <16 x i8> @@ -798,31 +2583,139 @@ define void @trunc_v32i64_v32i8(ptr %in, ptr %out) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v32i64_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #224] -; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #192] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] -; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #96] -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: uzp1 v1.4s, v5.4s, v4.4s -; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #128] -; NONEON-NOSVE-NEXT: ldp q17, q16, [x0, #160] -; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: ldp q19, q18, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q21, q20, [x0, #64] -; NONEON-NOSVE-NEXT: uzp1 v4.4s, v5.4s, v4.4s -; NONEON-NOSVE-NEXT: uzp1 v16.4s, v17.4s, v16.4s -; NONEON-NOSVE-NEXT: uzp1 v5.4s, v7.4s, v6.4s -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: uzp1 v7.4s, v19.4s, v18.4s -; NONEON-NOSVE-NEXT: uzp1 v6.4s, v21.4s, v20.4s -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v4.8h, v16.8h -; NONEON-NOSVE-NEXT: uzp1 v2.8h, v2.8h, v7.8h -; NONEON-NOSVE-NEXT: uzp1 v3.8h, v6.8h, v5.8h -; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: uzp1 v1.16b, v2.16b, v3.16b -; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b -; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: sub sp, sp, #416 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #96] +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #336] // 16-byte 
Folded Spill +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #64] +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #352] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #368] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #32] +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #384] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q17, q16, [x0, #128] +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #400] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0] +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #320] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q19, q18, [x0, #224] +; NONEON-NOSVE-NEXT: str x1, [sp, #24] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q21, q20, [x0, #192] +; NONEON-NOSVE-NEXT: ldp q23, q22, [x0, #160] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #160] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #176] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #184] +; NONEON-NOSVE-NEXT: stp q2, q4, [sp, #192] +; NONEON-NOSVE-NEXT: stp q21, q19, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w25, [sp, #208] +; NONEON-NOSVE-NEXT: ldr w26, [sp, #216] +; NONEON-NOSVE-NEXT: add w5, w9, w9 +; NONEON-NOSVE-NEXT: add w6, w8, w8 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #192] +; NONEON-NOSVE-NEXT: stp q20, q23, [sp, #96] +; NONEON-NOSVE-NEXT: ldr w2, [sp, #64] +; NONEON-NOSVE-NEXT: ldr w16, [sp, #48] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: ldr w18, [sp, #96] +; NONEON-NOSVE-NEXT: stp q22, q16, [sp, #128] +; NONEON-NOSVE-NEXT: stp q6, q5, [sp, #224] +; NONEON-NOSVE-NEXT: ldr w3, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w14, [sp, #128] +; NONEON-NOSVE-NEXT: stp q3, q17, [sp, #256] +; NONEON-NOSVE-NEXT: ldr w23, [sp, #240] +; NONEON-NOSVE-NEXT: ldr w21, [sp, #224] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #272] +; NONEON-NOSVE-NEXT: ldr w27, [sp, #256] +; NONEON-NOSVE-NEXT: ldr w28, [sp, #264] +; NONEON-NOSVE-NEXT: strb w9, [sp, #298] +; NONEON-NOSVE-NEXT: ldr w24, [sp, #248] +; NONEON-NOSVE-NEXT: ldr w22, [sp, #232] +; NONEON-NOSVE-NEXT: add w9, w27, w27 +; NONEON-NOSVE-NEXT: str w8, [sp, 
#20] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w8, [sp, #200] +; NONEON-NOSVE-NEXT: str q7, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w0, [sp, #104] +; NONEON-NOSVE-NEXT: ldr w12, [sp, #112] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w9, [sp, #296] +; NONEON-NOSVE-NEXT: add w9, w25, w25 +; NONEON-NOSVE-NEXT: str q18, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w19, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w20, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #299] +; NONEON-NOSVE-NEXT: add w8, w28, w28 +; NONEON-NOSVE-NEXT: ldr w4, [sp, #80] +; NONEON-NOSVE-NEXT: strb w9, [sp, #294] +; NONEON-NOSVE-NEXT: add w9, w23, w23 +; NONEON-NOSVE-NEXT: ldr w7, [sp, #88] +; NONEON-NOSVE-NEXT: strb w8, [sp, #297] +; NONEON-NOSVE-NEXT: add w8, w26, w26 +; NONEON-NOSVE-NEXT: ldr w17, [sp, #56] +; NONEON-NOSVE-NEXT: strb w9, [sp, #292] +; NONEON-NOSVE-NEXT: add w9, w21, w21 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #144] +; NONEON-NOSVE-NEXT: strb w8, [sp, #295] +; NONEON-NOSVE-NEXT: add w8, w24, w24 +; NONEON-NOSVE-NEXT: ldr w15, [sp, #136] +; NONEON-NOSVE-NEXT: strb w9, [sp, #290] +; NONEON-NOSVE-NEXT: add w9, w19, w19 +; NONEON-NOSVE-NEXT: ldr w13, [sp, #120] +; NONEON-NOSVE-NEXT: strb w8, [sp, #293] +; NONEON-NOSVE-NEXT: add w8, w22, w22 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #152] +; NONEON-NOSVE-NEXT: strb w9, [sp, #288] +; NONEON-NOSVE-NEXT: add w9, w4, w4 +; NONEON-NOSVE-NEXT: ldr w1, [sp, #280] +; NONEON-NOSVE-NEXT: strb w8, [sp, #291] +; NONEON-NOSVE-NEXT: add w8, w20, w20 +; NONEON-NOSVE-NEXT: ldr w29, [sp, #160] +; NONEON-NOSVE-NEXT: strb w9, [sp, #318] +; NONEON-NOSVE-NEXT: add w9, w2, w2 +; NONEON-NOSVE-NEXT: ldr w30, [sp, #168] +; NONEON-NOSVE-NEXT: strb w8, [sp, #289] +; NONEON-NOSVE-NEXT: add w8, w7, w7 +; NONEON-NOSVE-NEXT: strb w9, [sp, #316] +; NONEON-NOSVE-NEXT: add w9, w18, w18 +; NONEON-NOSVE-NEXT: strb w8, [sp, #319] +; NONEON-NOSVE-NEXT: add w8, w3, w3 +; NONEON-NOSVE-NEXT: strb w9, [sp, #314] +; NONEON-NOSVE-NEXT: add w9, w16, w16 +; NONEON-NOSVE-NEXT: strb 
w8, [sp, #317] +; NONEON-NOSVE-NEXT: add w8, w0, w0 +; NONEON-NOSVE-NEXT: strb w9, [sp, #312] +; NONEON-NOSVE-NEXT: add w9, w14, w14 +; NONEON-NOSVE-NEXT: strb w8, [sp, #315] +; NONEON-NOSVE-NEXT: add w8, w17, w17 +; NONEON-NOSVE-NEXT: strb w9, [sp, #310] +; NONEON-NOSVE-NEXT: add w9, w12, w12 +; NONEON-NOSVE-NEXT: strb w8, [sp, #313] +; NONEON-NOSVE-NEXT: add w8, w15, w15 +; NONEON-NOSVE-NEXT: strb w9, [sp, #308] +; NONEON-NOSVE-NEXT: add w9, w10, w10 +; NONEON-NOSVE-NEXT: strb w8, [sp, #311] +; NONEON-NOSVE-NEXT: add w8, w13, w13 +; NONEON-NOSVE-NEXT: strb w9, [sp, #306] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #20] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #309] +; NONEON-NOSVE-NEXT: add w8, w11, w11 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strb w5, [sp, #303] +; NONEON-NOSVE-NEXT: add w5, w30, w30 +; NONEON-NOSVE-NEXT: strb w6, [sp, #302] +; NONEON-NOSVE-NEXT: add w6, w29, w29 +; NONEON-NOSVE-NEXT: strb w8, [sp, #307] +; NONEON-NOSVE-NEXT: add w8, w1, w1 +; NONEON-NOSVE-NEXT: strb w5, [sp, #301] +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #400] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w6, [sp, #300] +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #384] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #305] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #304] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #368] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #288] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #352] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #336] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: stp q1, q0, [x8] +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #320] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: add sp, sp, #416 ; NONEON-NOSVE-NEXT: ret %a = load <32 x i64>, ptr %in %b = trunc <32 x i64> %a to <32 x i8> @@ -850,8 +2743,15 @@ define <4 x i16> @trunc_v4i64_v4i16(ptr %in) nounwind { ; NONEON-NOSVE-LABEL: 
trunc_v4i64_v4i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldp x8, x10, [sp] +; NONEON-NOSVE-NEXT: strh w9, [sp, #46] +; NONEON-NOSVE-NEXT: strh w10, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %a = load <4 x i64>, ptr %in %b = trunc <4 x i64> %a to <4 x i16> @@ -879,11 +2779,27 @@ define <8 x i16> @trunc_v8i64_v8i16(ptr %in) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v8i64_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: str q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: str q2, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldp x8, x10, [sp, #48] +; NONEON-NOSVE-NEXT: strh w9, [sp, #78] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldp x8, x11, [sp, #32] +; NONEON-NOSVE-NEXT: strh w10, [sp, #74] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: strh w11, [sp, #70] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %a = load <8 x i64>, ptr %in %b = trunc <8 x i64> %a to <8 x i16> @@ -925,19 +2841,66 @@ define void 
@trunc_v16i64_v16i16(ptr %in, ptr %out) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v16i64_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #64] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #96] +; NONEON-NOSVE-NEXT: sub sp, sp, #160 +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] ; NONEON-NOSVE-NEXT: ldp q5, q4, [x0] -; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #32] -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: uzp1 v3.4s, v5.4s, v4.4s -; NONEON-NOSVE-NEXT: uzp1 v1.4s, v7.4s, v6.4s -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v3.8h, v1.8h -; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h -; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #64] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #96] +; NONEON-NOSVE-NEXT: stp q2, q4, [sp, #64] +; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #96] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w2, [sp, #96] +; NONEON-NOSVE-NEXT: ldr w3, [sp, #104] +; NONEON-NOSVE-NEXT: stp q5, q7, [sp] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: ldr w4, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w5, [sp, #88] +; NONEON-NOSVE-NEXT: stp q6, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w18, [sp] +; NONEON-NOSVE-NEXT: ldr w0, [sp, #8] +; NONEON-NOSVE-NEXT: strh w9, [sp, #142] +; NONEON-NOSVE-NEXT: add w9, w3, w3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #140] +; NONEON-NOSVE-NEXT: add w8, w2, w2 +; NONEON-NOSVE-NEXT: ldr w16, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w17, [sp, #40] +; NONEON-NOSVE-NEXT: strh w9, [sp, #138] +; NONEON-NOSVE-NEXT: add w9, w5, w5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #136] +; NONEON-NOSVE-NEXT: add w8, w4, w4 +; NONEON-NOSVE-NEXT: ldr w14, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w15, [sp, #24] +; NONEON-NOSVE-NEXT: strh w9, [sp, #134] +; NONEON-NOSVE-NEXT: add w9, w0, w0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #132] 
+; NONEON-NOSVE-NEXT: add w8, w18, w18 +; NONEON-NOSVE-NEXT: ldr w12, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w13, [sp, #56] +; NONEON-NOSVE-NEXT: strh w9, [sp, #130] +; NONEON-NOSVE-NEXT: add w9, w17, w17 +; NONEON-NOSVE-NEXT: strh w8, [sp, #128] +; NONEON-NOSVE-NEXT: add w8, w16, w16 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #112] +; NONEON-NOSVE-NEXT: ldr w11, [sp, #120] +; NONEON-NOSVE-NEXT: strh w9, [sp, #158] +; NONEON-NOSVE-NEXT: add w9, w15, w15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #156] +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: strh w9, [sp, #154] +; NONEON-NOSVE-NEXT: add w9, w13, w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #152] +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: strh w9, [sp, #150] +; NONEON-NOSVE-NEXT: add w9, w11, w11 +; NONEON-NOSVE-NEXT: strh w8, [sp, #148] +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: strh w9, [sp, #146] +; NONEON-NOSVE-NEXT: strh w8, [sp, #144] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #128] ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %a = load <16 x i64>, ptr %in %b = trunc <16 x i64> %a to <16 x i16> @@ -1006,32 +2969,140 @@ define void @trunc_v32i64_v32i16(ptr %in, ptr %out) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v32i64_v32i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #128] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #160] -; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #192] -; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #224] -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: ldp q3, q1, [x0] -; NONEON-NOSVE-NEXT: uzp1 v4.4s, v5.4s, v4.4s -; NONEON-NOSVE-NEXT: ldp q17, q5, [x0, #64] -; NONEON-NOSVE-NEXT: uzp1 v6.4s, v7.4s, v6.4s -; NONEON-NOSVE-NEXT: ldp q16, q7, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q19, q18, [x0, #96] -; NONEON-NOSVE-NEXT: uzp1 v1.4s, v3.4s, v1.4s -; NONEON-NOSVE-NEXT: uzp1 v5.4s, v17.4s, v5.4s -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; 
NONEON-NOSVE-NEXT: uzp1 v7.4s, v16.4s, v7.4s -; NONEON-NOSVE-NEXT: uzp1 v3.4s, v19.4s, v18.4s -; NONEON-NOSVE-NEXT: uzp1 v2.8h, v4.8h, v6.8h -; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v1.8h, v7.8h -; NONEON-NOSVE-NEXT: uzp1 v3.8h, v5.8h, v3.8h -; NONEON-NOSVE-NEXT: add v2.8h, v2.8h, v2.8h -; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h -; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add v3.8h, v3.8h, v3.8h -; NONEON-NOSVE-NEXT: stp q1, q3, [x1] +; NONEON-NOSVE-NEXT: sub sp, sp, #432 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #96] +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #352] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #64] +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #368] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #384] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q17, q16, [x0, #128] +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #400] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #32] +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #416] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0] +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #336] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q19, q18, [x0, #224] +; NONEON-NOSVE-NEXT: ldp q21, q20, [x0, #192] +; NONEON-NOSVE-NEXT: ldp q23, q22, [x0, #160] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #144] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #160] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #168] +; NONEON-NOSVE-NEXT: stp q2, q4, [sp, #176] +; NONEON-NOSVE-NEXT: stp q21, q19, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w25, [sp, #192] +; NONEON-NOSVE-NEXT: ldr w26, [sp, #200] +; NONEON-NOSVE-NEXT: add w6, w8, w8 +; NONEON-NOSVE-NEXT: add w5, w9, w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #176] +; NONEON-NOSVE-NEXT: stp q20, q23, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w2, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w3, [sp, #56] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: ldr w18, [sp, #80] +; NONEON-NOSVE-NEXT: stp q22, q16, [sp, #112] +; 
NONEON-NOSVE-NEXT: stp q6, q5, [sp, #208] +; NONEON-NOSVE-NEXT: ldr w0, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w16, [sp, #32] +; NONEON-NOSVE-NEXT: stp q3, q17, [sp, #240] +; NONEON-NOSVE-NEXT: ldr w23, [sp, #224] +; NONEON-NOSVE-NEXT: ldr w24, [sp, #232] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #256] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #264] +; NONEON-NOSVE-NEXT: ldr w27, [sp, #240] +; NONEON-NOSVE-NEXT: ldr w28, [sp, #248] +; NONEON-NOSVE-NEXT: strh w9, [sp, #308] +; NONEON-NOSVE-NEXT: ldr w21, [sp, #208] +; NONEON-NOSVE-NEXT: add w9, w27, w27 +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #8] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w8, [sp, #184] +; NONEON-NOSVE-NEXT: str q7, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w22, [sp, #216] +; NONEON-NOSVE-NEXT: ldr w17, [sp, #40] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w9, [sp, #304] +; NONEON-NOSVE-NEXT: add w9, w25, w25 +; NONEON-NOSVE-NEXT: strh w8, [sp, #310] +; NONEON-NOSVE-NEXT: add w8, w28, w28 +; NONEON-NOSVE-NEXT: ldr w19, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #306] +; NONEON-NOSVE-NEXT: add w8, w26, w26 +; NONEON-NOSVE-NEXT: ldr w20, [sp, #24] +; NONEON-NOSVE-NEXT: str q18, [sp, #64] +; NONEON-NOSVE-NEXT: ldr w14, [sp, #112] +; NONEON-NOSVE-NEXT: ldr w15, [sp, #120] +; NONEON-NOSVE-NEXT: strh w8, [sp, #302] +; NONEON-NOSVE-NEXT: add w8, w24, w24 +; NONEON-NOSVE-NEXT: ldr w4, [sp, #64] +; NONEON-NOSVE-NEXT: strh w9, [sp, #300] +; NONEON-NOSVE-NEXT: add w9, w23, w23 +; NONEON-NOSVE-NEXT: ldr w7, [sp, #72] +; NONEON-NOSVE-NEXT: strh w8, [sp, #298] +; NONEON-NOSVE-NEXT: add w8, w22, w22 +; NONEON-NOSVE-NEXT: ldr w12, [sp, #96] +; NONEON-NOSVE-NEXT: strh w9, [sp, #296] +; NONEON-NOSVE-NEXT: add w9, w21, w21 +; NONEON-NOSVE-NEXT: ldr w13, [sp, #104] +; NONEON-NOSVE-NEXT: strh w8, [sp, #294] +; NONEON-NOSVE-NEXT: add w8, w20, w20 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #128] +; NONEON-NOSVE-NEXT: strh w9, [sp, #292] +; NONEON-NOSVE-NEXT: add w9, w19, w19 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #136] +; 
NONEON-NOSVE-NEXT: strh w8, [sp, #290] +; NONEON-NOSVE-NEXT: add w8, w7, w7 +; NONEON-NOSVE-NEXT: ldr w29, [sp, #144] +; NONEON-NOSVE-NEXT: strh w9, [sp, #288] +; NONEON-NOSVE-NEXT: add w9, w4, w4 +; NONEON-NOSVE-NEXT: ldr w30, [sp, #152] +; NONEON-NOSVE-NEXT: strh w8, [sp, #286] +; NONEON-NOSVE-NEXT: add w8, w3, w3 +; NONEON-NOSVE-NEXT: strh w9, [sp, #284] +; NONEON-NOSVE-NEXT: add w9, w2, w2 +; NONEON-NOSVE-NEXT: strh w8, [sp, #282] +; NONEON-NOSVE-NEXT: add w8, w0, w0 +; NONEON-NOSVE-NEXT: strh w9, [sp, #280] +; NONEON-NOSVE-NEXT: add w9, w18, w18 +; NONEON-NOSVE-NEXT: strh w8, [sp, #278] +; NONEON-NOSVE-NEXT: add w8, w17, w17 +; NONEON-NOSVE-NEXT: strh w9, [sp, #276] +; NONEON-NOSVE-NEXT: add w9, w16, w16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #274] +; NONEON-NOSVE-NEXT: add w8, w15, w15 +; NONEON-NOSVE-NEXT: strh w9, [sp, #272] +; NONEON-NOSVE-NEXT: add w9, w14, w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #334] +; NONEON-NOSVE-NEXT: add w8, w13, w13 +; NONEON-NOSVE-NEXT: strh w9, [sp, #332] +; NONEON-NOSVE-NEXT: add w9, w12, w12 +; NONEON-NOSVE-NEXT: strh w8, [sp, #330] +; NONEON-NOSVE-NEXT: add w8, w11, w11 +; NONEON-NOSVE-NEXT: strh w9, [sp, #328] +; NONEON-NOSVE-NEXT: add w9, w10, w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #326] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w9, [sp, #324] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #12] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w5, [sp, #318] +; NONEON-NOSVE-NEXT: add w5, w30, w30 +; NONEON-NOSVE-NEXT: strh w6, [sp, #316] +; NONEON-NOSVE-NEXT: add w6, w29, w29 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strh w5, [sp, #314] +; NONEON-NOSVE-NEXT: ldp q1, q3, [sp, #272] +; NONEON-NOSVE-NEXT: strh w6, [sp, #312] +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #416] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #322] +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #400] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w9, 
[sp, #320] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #384] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q2, q0, [sp, #304] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #368] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #352] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: stp q3, q2, [x1] +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #336] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: stp q0, q1, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #432 ; NONEON-NOSVE-NEXT: ret %a = load <32 x i64>, ptr %in %b = trunc <32 x i64> %a to <32 x i16> @@ -1058,7 +3129,13 @@ define <4 x i32> @trunc_v4i64_v4i32(ptr %in) nounwind { ; NONEON-NOSVE-LABEL: trunc_v4i64_v4i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldp x8, x10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %a = load <4 x i64>, ptr %in %b = trunc <4 x i64> %a to <4 x i32> @@ -1085,13 +3162,34 @@ define void @trunc_v8i64_v8i32(ptr %in, ptr %out) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v8i64_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] ; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: uzp1 v1.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: add v0.4s, v0.4s, v0.4s -; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q3, q1, [sp] +; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w12, [sp] +; NONEON-NOSVE-NEXT: ldr w13, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w14, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w15, [sp, 
#24] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w11, [sp, #56] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: add w9, w13, w13 +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: add w9, w15, w15 +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: add w9, w11, w11 +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %a = load <8 x i64>, ptr %in %b = trunc <8 x i64> %a to <8 x i32> @@ -1131,20 +3229,60 @@ define void @trunc_v16i64_v16i32(ptr %in, ptr %out) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v16i64_v16i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #64] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #96] +; NONEON-NOSVE-NEXT: sub sp, sp, #192 +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] ; NONEON-NOSVE-NEXT: ldp q5, q4, [x0] -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: ldp q6, q1, [x0, #32] -; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: uzp1 v3.4s, v5.4s, v4.4s -; NONEON-NOSVE-NEXT: uzp1 v1.4s, v6.4s, v1.4s -; NONEON-NOSVE-NEXT: add v0.4s, v0.4s, v0.4s -; NONEON-NOSVE-NEXT: add v2.4s, v2.4s, v2.4s -; NONEON-NOSVE-NEXT: add v3.4s, v3.4s, v3.4s -; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v1.4s -; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] -; NONEON-NOSVE-NEXT: stp q3, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #64] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #96] +; NONEON-NOSVE-NEXT: stp q2, q4, [sp, #64] +; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #96] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w2, [sp, #96] +; NONEON-NOSVE-NEXT: ldr w3, [sp, #104] +; 
NONEON-NOSVE-NEXT: stp q5, q7, [sp] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: ldr w4, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w5, [sp, #88] +; NONEON-NOSVE-NEXT: stp q6, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w18, [sp] +; NONEON-NOSVE-NEXT: ldr w0, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #168] +; NONEON-NOSVE-NEXT: add w9, w3, w3 +; NONEON-NOSVE-NEXT: add w8, w2, w2 +; NONEON-NOSVE-NEXT: ldr w16, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w17, [sp, #40] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #160] +; NONEON-NOSVE-NEXT: add w9, w5, w5 +; NONEON-NOSVE-NEXT: add w8, w4, w4 +; NONEON-NOSVE-NEXT: ldr w14, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w15, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #152] +; NONEON-NOSVE-NEXT: add w9, w0, w0 +; NONEON-NOSVE-NEXT: add w8, w18, w18 +; NONEON-NOSVE-NEXT: ldr w12, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w13, [sp, #56] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #144] +; NONEON-NOSVE-NEXT: add w9, w17, w17 +; NONEON-NOSVE-NEXT: add w8, w16, w16 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #112] +; NONEON-NOSVE-NEXT: ldr w11, [sp, #120] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #136] +; NONEON-NOSVE-NEXT: add w9, w15, w15 +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] +; NONEON-NOSVE-NEXT: add w9, w13, w13 +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #184] +; NONEON-NOSVE-NEXT: add w9, w11, w11 +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #176] +; NONEON-NOSVE-NEXT: ldp q1, q3, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q2, q0, [sp, #160] +; NONEON-NOSVE-NEXT: stp q3, q2, [x1] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #192 ; NONEON-NOSVE-NEXT: ret %a = load <16 x i64>, ptr %in %b = trunc <16 x i64> %a to <16 x i32> @@ -1206,34 +3344,145 @@ define void @trunc_v32i64_v32i32(ptr %in, ptr %out) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v32i64_v32i32: ; NONEON-NOSVE: // %bb.0: 
-; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #192] -; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #224] -; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #128] -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: ldp q16, q1, [x0, #160] -; NONEON-NOSVE-NEXT: uzp1 v4.4s, v5.4s, v4.4s -; NONEON-NOSVE-NEXT: ldp q17, q5, [x0, #64] -; NONEON-NOSVE-NEXT: uzp1 v6.4s, v7.4s, v6.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #496 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #416] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q17, q16, [x0, #192] +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #432] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #448] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q23, q22, [x0, #224] +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #464] // 16-byte Folded Spill ; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] -; NONEON-NOSVE-NEXT: ldp q18, q7, [x0, #96] -; NONEON-NOSVE-NEXT: uzp1 v1.4s, v16.4s, v1.4s -; NONEON-NOSVE-NEXT: uzp1 v5.4s, v17.4s, v5.4s -; NONEON-NOSVE-NEXT: ldp q17, q16, [x0, #32] -; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: add v0.4s, v0.4s, v0.4s -; NONEON-NOSVE-NEXT: add v4.4s, v4.4s, v4.4s -; NONEON-NOSVE-NEXT: uzp1 v7.4s, v18.4s, v7.4s -; NONEON-NOSVE-NEXT: add v3.4s, v6.4s, v6.4s -; NONEON-NOSVE-NEXT: uzp1 v6.4s, v17.4s, v16.4s -; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v1.4s -; NONEON-NOSVE-NEXT: stp q0, q4, [x1, #96] -; NONEON-NOSVE-NEXT: add v0.4s, v5.4s, v5.4s -; NONEON-NOSVE-NEXT: add v2.4s, v2.4s, v2.4s -; NONEON-NOSVE-NEXT: add v4.4s, v7.4s, v7.4s -; NONEON-NOSVE-NEXT: stp q3, q1, [x1, #64] -; NONEON-NOSVE-NEXT: add v1.4s, v6.4s, v6.4s -; NONEON-NOSVE-NEXT: stp q0, q4, [x1, #32] -; NONEON-NOSVE-NEXT: stp q2, q1, [x1] +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #480] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #96] +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #400] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #64] +; NONEON-NOSVE-NEXT: ldp q19, 
q18, [x0, #160] +; NONEON-NOSVE-NEXT: ldp q21, q20, [x0, #128] +; NONEON-NOSVE-NEXT: str q0, [sp, #192] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #192] +; NONEON-NOSVE-NEXT: stp q17, q23, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #200] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #32] +; NONEON-NOSVE-NEXT: stp q4, q6, [sp, #160] +; NONEON-NOSVE-NEXT: ldr w12, [sp, #48] +; NONEON-NOSVE-NEXT: add w6, w8, w8 +; NONEON-NOSVE-NEXT: add w5, w9, w9 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #40] +; NONEON-NOSVE-NEXT: stp q18, q20, [sp, #112] +; NONEON-NOSVE-NEXT: ldr w25, [sp, #160] +; NONEON-NOSVE-NEXT: ldr w26, [sp, #168] +; NONEON-NOSVE-NEXT: str q5, [sp, #144] +; NONEON-NOSVE-NEXT: ldr w21, [sp, #176] +; NONEON-NOSVE-NEXT: ldr w22, [sp, #184] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #208] +; NONEON-NOSVE-NEXT: ldr w23, [sp, #144] +; NONEON-NOSVE-NEXT: ldr w24, [sp, #152] +; NONEON-NOSVE-NEXT: str q3, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #208] +; NONEON-NOSVE-NEXT: ldr w4, [sp, #112] +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #8] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w8, [sp, #216] +; NONEON-NOSVE-NEXT: ldr w27, [sp, #16] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: ldr w28, [sp, #24] +; NONEON-NOSVE-NEXT: stp q22, q16, [sp, #64] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w9, [sp, #344] +; NONEON-NOSVE-NEXT: add w9, w27, w27 +; NONEON-NOSVE-NEXT: str w8, [sp, #348] +; NONEON-NOSVE-NEXT: add w8, w28, w28 +; NONEON-NOSVE-NEXT: ldr w7, [sp, #120] +; NONEON-NOSVE-NEXT: stp q7, q21, [sp, #240] +; NONEON-NOSVE-NEXT: ldr w18, [sp, #128] +; NONEON-NOSVE-NEXT: ldr w0, [sp, #136] +; NONEON-NOSVE-NEXT: str w8, [sp, #340] +; NONEON-NOSVE-NEXT: add w8, w26, w26 +; NONEON-NOSVE-NEXT: ldr w19, [sp, #240] +; NONEON-NOSVE-NEXT: str w9, [sp, #336] +; NONEON-NOSVE-NEXT: add w9, w25, w25 +; NONEON-NOSVE-NEXT: ldr w20, [sp, #248] +; NONEON-NOSVE-NEXT: str w8, [sp, #332] +; NONEON-NOSVE-NEXT: add w8, w24, w24 +; NONEON-NOSVE-NEXT: ldr w16, [sp, #256] +; 
NONEON-NOSVE-NEXT: str w9, [sp, #328] +; NONEON-NOSVE-NEXT: add w9, w23, w23 +; NONEON-NOSVE-NEXT: ldr w17, [sp, #264] +; NONEON-NOSVE-NEXT: str q19, [sp, #96] +; NONEON-NOSVE-NEXT: ldr w14, [sp, #64] +; NONEON-NOSVE-NEXT: ldr w15, [sp, #72] +; NONEON-NOSVE-NEXT: str w8, [sp, #324] +; NONEON-NOSVE-NEXT: add w8, w22, w22 +; NONEON-NOSVE-NEXT: ldr w2, [sp, #96] +; NONEON-NOSVE-NEXT: str w9, [sp, #320] +; NONEON-NOSVE-NEXT: add w9, w21, w21 +; NONEON-NOSVE-NEXT: ldr w3, [sp, #104] +; NONEON-NOSVE-NEXT: str w8, [sp, #380] +; NONEON-NOSVE-NEXT: add w8, w20, w20 +; NONEON-NOSVE-NEXT: ldr w13, [sp, #56] +; NONEON-NOSVE-NEXT: str w9, [sp, #376] +; NONEON-NOSVE-NEXT: add w9, w19, w19 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #80] +; NONEON-NOSVE-NEXT: str w8, [sp, #372] +; NONEON-NOSVE-NEXT: add w8, w7, w7 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #88] +; NONEON-NOSVE-NEXT: str w9, [sp, #368] +; NONEON-NOSVE-NEXT: add w9, w4, w4 +; NONEON-NOSVE-NEXT: ldr w29, [sp, #224] +; NONEON-NOSVE-NEXT: str w8, [sp, #316] +; NONEON-NOSVE-NEXT: add w8, w3, w3 +; NONEON-NOSVE-NEXT: ldr w30, [sp, #232] +; NONEON-NOSVE-NEXT: str w9, [sp, #312] +; NONEON-NOSVE-NEXT: add w9, w2, w2 +; NONEON-NOSVE-NEXT: str w8, [sp, #308] +; NONEON-NOSVE-NEXT: add w8, w0, w0 +; NONEON-NOSVE-NEXT: str w9, [sp, #304] +; NONEON-NOSVE-NEXT: add w9, w18, w18 +; NONEON-NOSVE-NEXT: str w8, [sp, #396] +; NONEON-NOSVE-NEXT: add w8, w17, w17 +; NONEON-NOSVE-NEXT: str w9, [sp, #392] +; NONEON-NOSVE-NEXT: add w9, w16, w16 +; NONEON-NOSVE-NEXT: str w8, [sp, #388] +; NONEON-NOSVE-NEXT: add w8, w15, w15 +; NONEON-NOSVE-NEXT: str w9, [sp, #384] +; NONEON-NOSVE-NEXT: add w9, w14, w14 +; NONEON-NOSVE-NEXT: str w8, [sp, #284] +; NONEON-NOSVE-NEXT: add w8, w13, w13 +; NONEON-NOSVE-NEXT: str w9, [sp, #280] +; NONEON-NOSVE-NEXT: add w9, w12, w12 +; NONEON-NOSVE-NEXT: str w8, [sp, #276] +; NONEON-NOSVE-NEXT: add w8, w11, w11 +; NONEON-NOSVE-NEXT: str w9, [sp, #272] +; NONEON-NOSVE-NEXT: add w9, w10, w10 +; NONEON-NOSVE-NEXT: str w8, [sp, 
#300] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: str w9, [sp, #296] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #12] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w5, [sp, #364] +; NONEON-NOSVE-NEXT: add w5, w30, w30 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: str w6, [sp, #360] +; NONEON-NOSVE-NEXT: add w6, w29, w29 +; NONEON-NOSVE-NEXT: str w5, [sp, #356] +; NONEON-NOSVE-NEXT: ldp q6, q3, [sp, #304] +; NONEON-NOSVE-NEXT: str w6, [sp, #352] +; NONEON-NOSVE-NEXT: ldp q4, q7, [sp, #368] +; NONEON-NOSVE-NEXT: str w8, [sp, #292] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #336] +; NONEON-NOSVE-NEXT: str w9, [sp, #288] +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #480] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q5, q2, [sp, #272] +; NONEON-NOSVE-NEXT: stp q4, q3, [x1, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #464] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: stp q7, q6, [x1, #64] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #448] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: stp q2, q5, [x1, #96] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #432] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #416] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #400] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: add sp, sp, #496 ; NONEON-NOSVE-NEXT: ret %a = load <32 x i64>, ptr %in %b = trunc <32 x i64> %a to <32 x i32> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll index 874af15e211177..323f5f56a2c085 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll @@ -18,8 +18,17 @@ define <4 x i8> @shuffle_ext_byone_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: 
shuffle_ext_byone_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ext v1.8b, v0.8b, v0.8b, #6 -; NONEON-NOSVE-NEXT: trn1 v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldur w8, [sp, #2] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: str w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %ret = shufflevector <4 x i8> %op1, <4 x i8> %op2, <4 x i32> ret <4 x i8> %ret @@ -38,7 +47,19 @@ define <8 x i8> @shuffle_ext_byone_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ext v0.8b, v0.8b, v1.8b, #7 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: sturh w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: stur w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %ret = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> ret <8 x i8> %ret @@ -57,7 +78,20 @@ define <16 x i8> @shuffle_ext_byone_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #15 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: sturh w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: stur w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: stur x8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %ret = shufflevector <16 x i8> %op1, <16 x i8> %op2, <16 x i32> @@ -80,11 +114,35 @@ define void @shuffle_ext_byone_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #15 -; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v2.16b, #15 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [sp, #-80]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: sturh w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: stur w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: stur x8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: sturh w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: stur w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: stur x8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #47] +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -107,7 +165,12 @@ define <2 x i16> @shuffle_ext_byone_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev64 v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %ret = shufflevector <2 x i16> %op1, <2 x i16> %op2, <2 x i32> ret <2 x i16> %ret @@ -126,7 +189,17 @@ define <4 x i16> @shuffle_ext_byone_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ext v0.8b, v0.8b, v1.8b, #6 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: stur w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %ret = shufflevector <4 x i16> %op1, <4 x i16> %op2, <4 x i32> ret <4 x i16> %ret @@ -145,7 +218,18 @@ define <8 x i16> @shuffle_ext_byone_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #14 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: stur w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: stur x8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %ret = shufflevector <8 x i16> %op1, <8 x i16> %op2, <8 x i32> ret <8 x i16> %ret @@ -167,11 +251,31 @@ define void @shuffle_ext_byone_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #14 -; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v2.16b, #14 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [sp, #-80]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: stur w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: stur x8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: strh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: stur w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: stur x8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #46] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -194,7 +298,13 @@ define <2 x i32> @shuffle_ext_byone_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ext v0.8b, v0.8b, v1.8b, #4 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #12] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %ret = shufflevector <2 x i32> %op1, <2 x i32> %op2, <2 x i32> ret <2 x i32> %ret @@ -213,7 +323,16 @@ define <4 x i32> @shuffle_ext_byone_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #12 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: str w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: stur x8, [sp, #36] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: str w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %ret = shufflevector <4 x i32> %op1, <4 x i32> %op2, <4 x i32> ret <4 x i32> %ret @@ -235,11 +354,26 @@ define void @shuffle_ext_byone_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #12 -; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v2.16b, #12 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [sp, #-80]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #32] +; NONEON-NOSVE-NEXT: str w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: stur x8, [sp, #20] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: str w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: str w9, [sp, #16] +; NONEON-NOSVE-NEXT: stur x8, [sp, #68] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: str w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -261,7 +395,12 @@ define <2 x i64> @shuffle_ext_byone_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #8 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #8] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %ret = shufflevector <2 x i64> %op1, <2 x i64> %op2, <2 x i32> ret <2 x i64> %ret @@ -283,11 +422,20 @@ define void @shuffle_ext_byone_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #8 -; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v2.16b, #8 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [sp, #-80]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x9, [sp] +; NONEON-NOSVE-NEXT: ldp x11, x10, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #40] +; NONEON-NOSVE-NEXT: stp x10, x9, [sp, #16] +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -309,7 +457,17 @@ define <4 x half> @shuffle_ext_byone_v4f16(<4 x half> %op1, <4 x half> %op2) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ext v0.8b, v0.8b, v1.8b, #6 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: stur w8, [sp, #26] +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; 
NONEON-NOSVE-NEXT: ret %ret = shufflevector <4 x half> %op1, <4 x half> %op2, <4 x i32> ret <4 x half> %ret @@ -327,7 +485,18 @@ define <8 x half> @shuffle_ext_byone_v8f16(<8 x half> %op1, <8 x half> %op2) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #14 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: stur w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: stur x8, [sp, #34] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %ret = shufflevector <8 x half> %op1, <8 x half> %op2, <8 x i32> ret <8 x half> %ret @@ -347,11 +516,31 @@ define void @shuffle_ext_byone_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #14 -; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v2.16b, #14 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [sp, #-80]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #32] +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: stur w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: stur x8, [sp, #18] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: str h0, [sp, #78] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: stur w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: str h0, [sp, #64] +; NONEON-NOSVE-NEXT: stur x8, [sp, #66] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -373,7 +562,13 @@ define <2 x float> @shuffle_ext_byone_v2f32(<2 x float> %op1, <2 x float> %op2) ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ext v0.8b, v0.8b, v1.8b, #4 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #12] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %ret = shufflevector <2 x float> %op1, <2 x float> %op2, <2 x i32> ret <2 x float> %ret @@ -391,7 +586,16 @@ define <4 x float> @shuffle_ext_byone_v4f32(<4 x float> %op1, <4 x float> %op2) ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #12 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: str s0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: stur x8, [sp, #36] +; NONEON-NOSVE-NEXT: str s0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %ret = shufflevector <4 x float> %op1, <4 x float> %op2, <4 x i32> ret <4 x float> %ret @@ -411,11 +615,26 @@ define void @shuffle_ext_byone_v8f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #12 -; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v2.16b, #12 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [sp, #-80]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: str s0, [sp, #28] +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: stur x8, [sp, #20] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: str s0, [sp, #76] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #44] +; NONEON-NOSVE-NEXT: str s1, [sp, #16] +; NONEON-NOSVE-NEXT: stur x8, [sp, #68] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: str s0, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -436,7 +655,12 @@ define <2 x double> @shuffle_ext_byone_v2f64(<2 x double> %op1, <2 x double> %op ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #8 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #8] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %ret = shufflevector <2 x double> %op1, <2 x double> %op2, <2 x i32> ret <2 x double> %ret @@ -456,11 +680,20 @@ define void @shuffle_ext_byone_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #8 -; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v2.16b, #8 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [sp, #-80]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d1, [sp] +; NONEON-NOSVE-NEXT: ldp d3, d2, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: stp d2, d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -483,11 +716,21 @@ define void @shuffle_ext_byone_reverse(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_reverse: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] -; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v0.16b, #8 -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v2.16b, #8 -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: str q2, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #40] +; 
NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll index e69f59aedc026f..67cdde718e391f 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll @@ -43,7 +43,8 @@ define <2 x i64> @fixed_vec_zero_constant() { ; ; NONEON-NOSVE-LABEL: fixed_vec_zero_constant: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI2_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI2_0] ; NONEON-NOSVE-NEXT: ret ret <2 x i64> zeroinitializer } @@ -57,7 +58,8 @@ define <2 x double> @fixed_vec_fp_zero_constant() { ; ; NONEON-NOSVE-LABEL: fixed_vec_fp_zero_constant: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI3_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI3_0] ; NONEON-NOSVE-NEXT: ret ret <2 x double> } From a72a90677d2b320e3bca553698e99143034387d9 Mon Sep 17 00:00:00 2001 From: Tulio Magno Quites Machado Filho Date: Wed, 29 May 2024 06:19:17 -0300 Subject: [PATCH 083/230] [Nomination] Add an extra Red Hat representative to the security group (#92174) I'd like to nominate myself as another Red Hat representative. I work at the LLVM team at Red Hat contributing to upstream and downstream. 
--- llvm/docs/Security.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/docs/Security.rst b/llvm/docs/Security.rst index 9140923e5e8c9d..a468ff51d2a6aa 100644 --- a/llvm/docs/Security.rst +++ b/llvm/docs/Security.rst @@ -55,6 +55,7 @@ username for an individual isn't available, the brackets will be empty. * Serge Guelton (Mozilla) [@serge-sans-paille] * Shayne Hiet-Block (Microsoft) [@GreatKeeper] * Tim Penge (Sony) [] +* Tulio Magno Quites Machado Filho (Red Hat) [@tuliom] * Will Huhn (Intel) [@wphuhn-intel] Criteria From 0f7b4b04a548e10d0f552f13bebc21972d55d7f6 Mon Sep 17 00:00:00 2001 From: Shengchen Kan Date: Wed, 29 May 2024 17:30:14 +0800 Subject: [PATCH 084/230] [X86][Driver] Enable feature ccmp,nf for -mapxf This is follow-up for #78901 after validation. --- clang/include/clang/Driver/Options.td | 8 +++----- clang/lib/Basic/Targets/X86.cpp | 2 +- clang/test/Driver/x86-target-features.c | 4 ++-- clang/test/Preprocessor/x86_target_features.c | 2 +- 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index de2f245fb29f8e..4119e69c85540e 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -6277,11 +6277,9 @@ def mapx_features_EQ : CommaJoined<["-"], "mapx-features=">, Group, Values<"egpr,push2pop2,ppx,ndd,ccmp,nf,cf">; def mno_apx_features_EQ : CommaJoined<["-"], "mno-apx-features=">, Group, HelpText<"Disable features of APX">, Values<"egpr,push2pop2,ppx,ndd,ccmp,nf,cf">; -// Features egpr, push2pop2, ppx and ndd are validated with llvm-test-suite && cpu2017 on Intel SDE. -// For stability, we turn on these features only for -mapxf. After a feature pass the validation, -// we will add it to -mapxf. 
-def mapxf : Flag<["-"], "mapxf">, Alias, AliasArgs<["egpr","push2pop2","ppx", "ndd"]>; -def mno_apxf : Flag<["-"], "mno-apxf">, Alias, AliasArgs<["egpr","push2pop2","ppx","ndd"]>; +// For stability, we only add a feature to -mapxf after it passes the validation of llvm-test-suite && cpu2017 on Intel SDE. +def mapxf : Flag<["-"], "mapxf">, Alias, AliasArgs<["egpr","push2pop2","ppx","ndd","ccmp","nf"]>; +def mno_apxf : Flag<["-"], "mno-apxf">, Alias, AliasArgs<["egpr","push2pop2","ppx","ndd","ccmp","nf"]>; } // let Flags = [TargetSpecific] // VE feature flags diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index 3a30cff917bb4f..08e44360bfbe38 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -961,7 +961,7 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, if (HasCF) Builder.defineMacro("__CF__"); // Condition here is aligned with the feature set of mapxf in Options.td - if (HasEGPR && HasPush2Pop2 && HasPPX && HasNDD) + if (HasEGPR && HasPush2Pop2 && HasPPX && HasNDD && HasCCMP && HasNF) Builder.defineMacro("__APX_F__"); // Each case falls through to the previous one here. 
diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c index 1d5f001c23fcc0..3022ed1250d590 100644 --- a/clang/test/Driver/x86-target-features.c +++ b/clang/test/Driver/x86-target-features.c @@ -423,8 +423,8 @@ // RUN: %clang -target x86_64-unknown-linux-gnu -mno-apxf -mapxf %s -### -o %t.o 2>&1 | FileCheck -check-prefix=APXF %s // RUN: %clang -target x86_64-unknown-linux-gnu -mapxf -mno-apxf %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-APXF %s // -// APXF: "-target-feature" "+egpr" "-target-feature" "+push2pop2" "-target-feature" "+ppx" "-target-feature" "+ndd" -// NO-APXF: "-target-feature" "-egpr" "-target-feature" "-push2pop2" "-target-feature" "-ppx" "-target-feature" "-ndd" +// APXF: "-target-feature" "+egpr" "-target-feature" "+push2pop2" "-target-feature" "+ppx" "-target-feature" "+ndd" "-target-feature" "+ccmp" "-target-feature" "+nf" +// NO-APXF: "-target-feature" "-egpr" "-target-feature" "-push2pop2" "-target-feature" "-ppx" "-target-feature" "-ndd" "-target-feature" "-ccmp" "-target-feature" "-nf" // RUN: %clang -target x86_64-unknown-linux-gnu -mapx-features=egpr %s -### -o %t.o 2>&1 | FileCheck -check-prefix=EGPR %s // RUN: %clang -target x86_64-unknown-linux-gnu -mapx-features=push2pop2 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=PUSH2POP2 %s diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c index 7567267be26b42..6c08b379c93860 100644 --- a/clang/test/Preprocessor/x86_target_features.c +++ b/clang/test/Preprocessor/x86_target_features.c @@ -754,7 +754,7 @@ // RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=ccmp -x c -E -dM -o - %s | FileCheck --check-prefix=CCMP %s // RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=nf -x c -E -dM -o - %s | FileCheck --check-prefix=NF %s // RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=cf -x c -E -dM -o - %s | FileCheck --check-prefix=CF %s 
-// RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapxf -x c -E -dM -o - %s | FileCheck --check-prefixes=EGPR,PUSH2POP2,PPX,NDD,APXF %s +// RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapxf -x c -E -dM -o - %s | FileCheck --check-prefixes=EGPR,PUSH2POP2,PPX,NDD,CCMP,NF,APXF %s // APXF: #define __APX_F__ 1 // CCMP: #define __CCMP__ 1 // CF: #define __CF__ 1 From f3fb7f569936db418feef98e4ae68777a9a4cd2a Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 29 May 2024 10:31:40 +0100 Subject: [PATCH 085/230] [X86] x86-atomic-float.c - cleanup unused check prefixes --- clang/test/CodeGen/X86/x86-atomic-float.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/clang/test/CodeGen/X86/x86-atomic-float.c b/clang/test/CodeGen/X86/x86-atomic-float.c index 2d3c72d2a0299f..6ee441c2dd7a8c 100644 --- a/clang/test/CodeGen/X86/x86-atomic-float.c +++ b/clang/test/CodeGen/X86/x86-atomic-float.c @@ -1,11 +1,11 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 -// RUN: %clang_cc1 -triple x86_64-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck -check-prefixes=CHECK,CHECK64 %s -// RUN: %clang_cc1 -triple i686-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck -check-prefixes=CHECK,CHECK32 %s +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -triple x86_64-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -triple i686-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck %s // CHECK-LABEL: define dso_local i32 @test_int_inc( // CHECK-SAME: ) #[[ATTR0:[0-9]+]] { -// CHECK-NEXT: entry: +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = atomicrmw add ptr @test_int_inc.n, i32 1 seq_cst, align 4 // CHECK-NEXT: ret i32 [[TMP0]] // @@ -17,7 +17,7 @@ int test_int_inc() // CHECK-LABEL: define dso_local float @test_float_post_inc( // CHECK-SAME: ) 
#[[ATTR0]] { -// CHECK-NEXT: entry: +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr @test_float_post_inc.n, float 1.000000e+00 seq_cst, align 4 // CHECK-NEXT: ret float [[TMP0]] // @@ -29,7 +29,7 @@ float test_float_post_inc() // CHECK-LABEL: define dso_local float @test_float_post_dc( // CHECK-SAME: ) #[[ATTR0]] { -// CHECK-NEXT: entry: +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr @test_float_post_dc.n, float 1.000000e+00 seq_cst, align 4 // CHECK-NEXT: ret float [[TMP0]] // @@ -41,7 +41,7 @@ float test_float_post_dc() // CHECK-LABEL: define dso_local float @test_float_pre_dc( // CHECK-SAME: ) #[[ATTR0]] { -// CHECK-NEXT: entry: +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr @test_float_pre_dc.n, float 1.000000e+00 seq_cst, align 4 // CHECK-NEXT: [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00 // CHECK-NEXT: ret float [[TMP1]] @@ -54,7 +54,7 @@ float test_float_pre_dc() // CHECK-LABEL: define dso_local float @test_float_pre_inc( // CHECK-SAME: ) #[[ATTR0]] { -// CHECK-NEXT: entry: +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr @test_float_pre_inc.n, float 1.000000e+00 seq_cst, align 4 // CHECK-NEXT: [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00 // CHECK-NEXT: ret float [[TMP1]] @@ -64,6 +64,3 @@ float test_float_pre_inc() static _Atomic float n; return ++n; } -//// NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -// CHECK32: {{.*}} -// CHECK64: {{.*}} From 4bb6974a87e495f19faea4b13475a65e842473f0 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 29 May 2024 10:32:49 +0100 Subject: [PATCH 086/230] [X86] x86-atomic-long_double.c - cleanup check prefixes --- .../test/CodeGen/X86/x86-atomic-long_double.c | 573 +++++++++--------- 1 file changed, 287 insertions(+), 286 deletions(-) diff --git a/clang/test/CodeGen/X86/x86-atomic-long_double.c b/clang/test/CodeGen/X86/x86-atomic-long_double.c index 74a22d5db151eb..2c3f381f13511e 100644 --- a/clang/test/CodeGen/X86/x86-atomic-long_double.c +++ b/clang/test/CodeGen/X86/x86-atomic-long_double.c @@ -1,170 +1,171 @@ -// RUN: %clang_cc1 -triple x86_64-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -triple i686-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck -check-prefix=CHECK32 %s +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -triple x86_64-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck --check-prefixes=X64 %s +// RUN: %clang_cc1 -triple i686-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck --check-prefixes=X86 %s -// CHECK-LABEL: define dso_local x86_fp80 @testinc( -// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 16 -// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16 -// CHECK-NEXT: [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00 -// CHECK-NEXT: store float [[TMP2]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret x86_fp80 [[TMP3]] +// X64-LABEL: define dso_local 
x86_fp80 @testinc( +// X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] { +// X64-NEXT: [[ENTRY:.*:]] +// X64-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8 +// X64-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8 +// X64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 +// X64-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16 +// X64-NEXT: [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00 +// X64-NEXT: store float [[TMP2]], ptr [[RETVAL]], align 16 +// X64-NEXT: [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16 +// X64-NEXT: ret x86_fp80 [[TMP3]] // -// CHECK32-LABEL: define dso_local x86_fp80 @testinc( -// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK32-NEXT: entry: -// CHECK32-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 4 -// CHECK32-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 4 -// CHECK32-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4 -// CHECK32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 -// CHECK32-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4 -// CHECK32-NEXT: [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00 -// CHECK32-NEXT: store float [[TMP2]], ptr [[RETVAL]], align 4 -// CHECK32-NEXT: [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4 -// CHECK32-NEXT: ret x86_fp80 [[TMP3]] +// X86-LABEL: define dso_local x86_fp80 @testinc( +// X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 +// X86-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4 +// X86-NEXT: [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00 +// X86-NEXT: store float [[TMP2]], ptr [[RETVAL]], 
align 4 +// X86-NEXT: [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4 +// X86-NEXT: ret x86_fp80 [[TMP3]] // long double testinc(_Atomic long double *addr) { return ++*addr; } -// CHECK-LABEL: define dso_local x86_fp80 @testdec( -// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 16 -// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16 -// CHECK-NEXT: store float [[TMP1]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret x86_fp80 [[TMP2]] +// X64-LABEL: define dso_local x86_fp80 @testdec( +// X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { +// X64-NEXT: [[ENTRY:.*:]] +// X64-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8 +// X64-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8 +// X64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 +// X64-NEXT: [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16 +// X64-NEXT: store float [[TMP1]], ptr [[RETVAL]], align 16 +// X64-NEXT: [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16 +// X64-NEXT: ret x86_fp80 [[TMP2]] // -// CHECK32-LABEL: define dso_local x86_fp80 @testdec( -// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { -// CHECK32-NEXT: entry: -// CHECK32-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 4 -// CHECK32-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 4 -// CHECK32-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4 -// CHECK32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 -// CHECK32-NEXT: [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4 -// CHECK32-NEXT: store float [[TMP1]], ptr 
[[RETVAL]], align 4 -// CHECK32-NEXT: [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4 -// CHECK32-NEXT: ret x86_fp80 [[TMP2]] +// X86-LABEL: define dso_local x86_fp80 @testdec( +// X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 +// X86-NEXT: [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4 +// X86-NEXT: store float [[TMP1]], ptr [[RETVAL]], align 4 +// X86-NEXT: [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4 +// X86-NEXT: ret x86_fp80 [[TMP2]] // long double testdec(_Atomic long double *addr) { return (*addr)--; } -// CHECK-LABEL: define dso_local x86_fp80 @testcompassign( -// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16 -// CHECK-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16 -// CHECK-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 16 -// CHECK-NEXT: [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 16 -// CHECK-NEXT: [[ATOMIC_TEMP5:%.*]] = alloca x86_fp80, align 16 -// CHECK-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 -// CHECK-NEXT: [[ATOMIC_LOAD:%.*]] = load atomic i128, ptr [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP]], align 16 -// CHECK-NEXT: [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 16 -// CHECK-NEXT: br label [[ATOMIC_OP:%.*]] -// CHECK: atomic_op: -// CHECK-NEXT: [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TMP8:%.*]], [[ATOMIC_OP]] ] -// CHECK-NEXT: [[SUB:%.*]] = fsub x86_fp80 [[TMP2]], 0xK4003C800000000000000 -// CHECK-NEXT: call void 
@llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP1]], i8 0, i64 16, i1 false) -// CHECK-NEXT: store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 16 -// CHECK-NEXT: [[TMP3:%.*]] = load i128, ptr [[ATOMIC_TEMP1]], align 16 -// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP2]], i8 0, i64 16, i1 false) -// CHECK-NEXT: store x86_fp80 [[SUB]], ptr [[ATOMIC_TEMP2]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = load i128, ptr [[ATOMIC_TEMP2]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[TMP0]], i128 [[TMP3]], i128 [[TMP4]] seq_cst seq_cst, align 16 -// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i128, i1 } [[TMP5]], 0 -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i128, i1 } [[TMP5]], 1 -// CHECK-NEXT: store i128 [[TMP6]], ptr [[ATOMIC_TEMP3]], align 16 -// CHECK-NEXT: [[TMP8]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 16 -// CHECK-NEXT: br i1 [[TMP7]], label [[ATOMIC_CONT:%.*]], label [[ATOMIC_OP]] -// CHECK: atomic_cont: -// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 -// CHECK-NEXT: [[ATOMIC_LOAD4:%.*]] = load atomic i128, ptr [[TMP9]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[ATOMIC_LOAD4]], ptr [[ATOMIC_TEMP5]], align 16 -// CHECK-NEXT: [[TMP10:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP5]], align 16 -// CHECK-NEXT: ret x86_fp80 [[TMP10]] +// X64-LABEL: define dso_local x86_fp80 @testcompassign( +// X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { +// X64-NEXT: [[ENTRY:.*]]: +// X64-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8 +// X64-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP5:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8 +// X64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 +// X64-NEXT: [[ATOMIC_LOAD:%.*]] = load atomic i128, ptr 
[[TMP0]] seq_cst, align 16 +// X64-NEXT: store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP]], align 16 +// X64-NEXT: [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 16 +// X64-NEXT: br label %[[ATOMIC_OP:.*]] +// X64: [[ATOMIC_OP]]: +// X64-NEXT: [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], %[[ENTRY]] ], [ [[TMP8:%.*]], %[[ATOMIC_OP]] ] +// X64-NEXT: [[SUB:%.*]] = fsub x86_fp80 [[TMP2]], 0xK4003C800000000000000 +// X64-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP1]], i8 0, i64 16, i1 false) +// X64-NEXT: store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 16 +// X64-NEXT: [[TMP3:%.*]] = load i128, ptr [[ATOMIC_TEMP1]], align 16 +// X64-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP2]], i8 0, i64 16, i1 false) +// X64-NEXT: store x86_fp80 [[SUB]], ptr [[ATOMIC_TEMP2]], align 16 +// X64-NEXT: [[TMP4:%.*]] = load i128, ptr [[ATOMIC_TEMP2]], align 16 +// X64-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[TMP0]], i128 [[TMP3]], i128 [[TMP4]] seq_cst seq_cst, align 16 +// X64-NEXT: [[TMP6:%.*]] = extractvalue { i128, i1 } [[TMP5]], 0 +// X64-NEXT: [[TMP7:%.*]] = extractvalue { i128, i1 } [[TMP5]], 1 +// X64-NEXT: store i128 [[TMP6]], ptr [[ATOMIC_TEMP3]], align 16 +// X64-NEXT: [[TMP8]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 16 +// X64-NEXT: br i1 [[TMP7]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]] +// X64: [[ATOMIC_CONT]]: +// X64-NEXT: [[TMP9:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 +// X64-NEXT: [[ATOMIC_LOAD4:%.*]] = load atomic i128, ptr [[TMP9]] seq_cst, align 16 +// X64-NEXT: store i128 [[ATOMIC_LOAD4]], ptr [[ATOMIC_TEMP5]], align 16 +// X64-NEXT: [[TMP10:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP5]], align 16 +// X64-NEXT: ret x86_fp80 [[TMP10]] // -// CHECK32-LABEL: define dso_local x86_fp80 @testcompassign( -// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { -// CHECK32-NEXT: entry: -// CHECK32-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 4 -// CHECK32-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4 -// 
CHECK32-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4 -// CHECK32-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 4 -// CHECK32-NEXT: [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 4 -// CHECK32-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4 -// CHECK32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 -// CHECK32-NEXT: call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5) -// CHECK32-NEXT: [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 4 -// CHECK32-NEXT: br label [[ATOMIC_OP:%.*]] -// CHECK32: atomic_op: -// CHECK32-NEXT: [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TMP3:%.*]], [[ATOMIC_OP]] ] -// CHECK32-NEXT: [[SUB:%.*]] = fsub x86_fp80 [[TMP2]], 0xK4003C800000000000000 -// CHECK32-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP1]], i8 0, i64 12, i1 false) -// CHECK32-NEXT: store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 4 -// CHECK32-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP2]], i8 0, i64 12, i1 false) -// CHECK32-NEXT: store x86_fp80 [[SUB]], ptr [[ATOMIC_TEMP2]], align 4 -// CHECK32-NEXT: [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP1]], ptr noundef [[ATOMIC_TEMP2]], i32 noundef 5, i32 noundef 5) -// CHECK32-NEXT: [[TMP3]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4 -// CHECK32-NEXT: br i1 [[CALL]], label [[ATOMIC_CONT:%.*]], label [[ATOMIC_OP]] -// CHECK32: atomic_cont: -// CHECK32-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 -// CHECK32-NEXT: call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP4]], ptr noundef [[ATOMIC_TEMP3]], i32 noundef 5) -// CHECK32-NEXT: [[TMP5:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 4 -// CHECK32-NEXT: ret x86_fp80 [[TMP5]] +// X86-LABEL: define dso_local x86_fp80 @testcompassign( +// X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { +// X86-NEXT: [[ENTRY:.*]]: +// X86-NEXT: 
[[ADDR_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 +// X86-NEXT: call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5) +// X86-NEXT: [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 4 +// X86-NEXT: br label %[[ATOMIC_OP:.*]] +// X86: [[ATOMIC_OP]]: +// X86-NEXT: [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], %[[ENTRY]] ], [ [[TMP3:%.*]], %[[ATOMIC_OP]] ] +// X86-NEXT: [[SUB:%.*]] = fsub x86_fp80 [[TMP2]], 0xK4003C800000000000000 +// X86-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP1]], i8 0, i64 12, i1 false) +// X86-NEXT: store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 4 +// X86-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP2]], i8 0, i64 12, i1 false) +// X86-NEXT: store x86_fp80 [[SUB]], ptr [[ATOMIC_TEMP2]], align 4 +// X86-NEXT: [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP1]], ptr noundef [[ATOMIC_TEMP2]], i32 noundef 5, i32 noundef 5) +// X86-NEXT: [[TMP3]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4 +// X86-NEXT: br i1 [[CALL]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]] +// X86: [[ATOMIC_CONT]]: +// X86-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 +// X86-NEXT: call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP4]], ptr noundef [[ATOMIC_TEMP3]], i32 noundef 5) +// X86-NEXT: [[TMP5:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 4 +// X86-NEXT: ret x86_fp80 [[TMP5]] // long double testcompassign(_Atomic long double *addr) { *addr -= 25; return *addr; } -// CHECK-LABEL: define dso_local x86_fp80 @testassign( 
-// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16 -// CHECK-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16 -// CHECK-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP]], i8 0, i64 16, i1 false) -// CHECK-NEXT: store x86_fp80 0xK4005E600000000000000, ptr [[ATOMIC_TEMP]], align 16 -// CHECK-NEXT: [[TMP1:%.*]] = load i128, ptr [[ATOMIC_TEMP]], align 16 -// CHECK-NEXT: store atomic i128 [[TMP1]], ptr [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 -// CHECK-NEXT: [[ATOMIC_LOAD:%.*]] = load atomic i128, ptr [[TMP2]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP1]], align 16 -// CHECK-NEXT: [[TMP3:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 16 -// CHECK-NEXT: ret x86_fp80 [[TMP3]] +// X64-LABEL: define dso_local x86_fp80 @testassign( +// X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { +// X64-NEXT: [[ENTRY:.*:]] +// X64-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8 +// X64-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8 +// X64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 +// X64-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP]], i8 0, i64 16, i1 false) +// X64-NEXT: store x86_fp80 0xK4005E600000000000000, ptr [[ATOMIC_TEMP]], align 16 +// X64-NEXT: [[TMP1:%.*]] = load i128, ptr [[ATOMIC_TEMP]], align 16 +// X64-NEXT: store atomic i128 [[TMP1]], ptr [[TMP0]] seq_cst, align 16 +// X64-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 +// X64-NEXT: [[ATOMIC_LOAD:%.*]] = load atomic i128, ptr [[TMP2]] seq_cst, align 16 +// 
X64-NEXT: store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP1]], align 16 +// X64-NEXT: [[TMP3:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 16 +// X64-NEXT: ret x86_fp80 [[TMP3]] // -// CHECK32-LABEL: define dso_local x86_fp80 @testassign( -// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { -// CHECK32-NEXT: entry: -// CHECK32-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 4 -// CHECK32-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4 -// CHECK32-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4 -// CHECK32-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4 -// CHECK32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 -// CHECK32-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP]], i8 0, i64 12, i1 false) -// CHECK32-NEXT: store x86_fp80 0xK4005E600000000000000, ptr [[ATOMIC_TEMP]], align 4 -// CHECK32-NEXT: call void @__atomic_store(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5) -// CHECK32-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 -// CHECK32-NEXT: call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP1]], ptr noundef [[ATOMIC_TEMP1]], i32 noundef 5) -// CHECK32-NEXT: [[TMP2:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4 -// CHECK32-NEXT: ret x86_fp80 [[TMP2]] +// X86-LABEL: define dso_local x86_fp80 @testassign( +// X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 +// X86-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP]], i8 0, i64 12, i1 false) +// X86-NEXT: store x86_fp80 0xK4005E600000000000000, ptr [[ATOMIC_TEMP]], align 4 +// X86-NEXT: call void @__atomic_store(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef 
[[ATOMIC_TEMP]], i32 noundef 5) +// X86-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 +// X86-NEXT: call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP1]], ptr noundef [[ATOMIC_TEMP1]], i32 noundef 5) +// X86-NEXT: [[TMP2:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4 +// X86-NEXT: ret x86_fp80 [[TMP2]] // long double testassign(_Atomic long double *addr) { *addr = 115; @@ -172,168 +173,168 @@ long double testassign(_Atomic long double *addr) { return *addr; } -// CHECK-LABEL: define dso_local x86_fp80 @test_volatile_inc( -// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 16 -// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16 -// CHECK-NEXT: [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00 -// CHECK-NEXT: store float [[TMP2]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret x86_fp80 [[TMP3]] +// X64-LABEL: define dso_local x86_fp80 @test_volatile_inc( +// X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { +// X64-NEXT: [[ENTRY:.*:]] +// X64-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8 +// X64-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8 +// X64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 +// X64-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16 +// X64-NEXT: [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00 +// X64-NEXT: store float [[TMP2]], ptr [[RETVAL]], align 16 +// X64-NEXT: [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16 +// X64-NEXT: ret x86_fp80 [[TMP3]] // -// CHECK32-LABEL: define dso_local x86_fp80 @test_volatile_inc( -// 
CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { -// CHECK32-NEXT: entry: -// CHECK32-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 4 -// CHECK32-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 4 -// CHECK32-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4 -// CHECK32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 -// CHECK32-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4 -// CHECK32-NEXT: [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00 -// CHECK32-NEXT: store float [[TMP2]], ptr [[RETVAL]], align 4 -// CHECK32-NEXT: [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4 -// CHECK32-NEXT: ret x86_fp80 [[TMP3]] +// X86-LABEL: define dso_local x86_fp80 @test_volatile_inc( +// X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 +// X86-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4 +// X86-NEXT: [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00 +// X86-NEXT: store float [[TMP2]], ptr [[RETVAL]], align 4 +// X86-NEXT: [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4 +// X86-NEXT: ret x86_fp80 [[TMP3]] // long double test_volatile_inc(volatile _Atomic long double *addr) { return ++*addr; } -// CHECK-LABEL: define dso_local x86_fp80 @test_volatile_dec( -// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 16 -// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16 -// CHECK-NEXT: store float [[TMP1]], ptr [[RETVAL]], 
align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret x86_fp80 [[TMP2]] +// X64-LABEL: define dso_local x86_fp80 @test_volatile_dec( +// X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { +// X64-NEXT: [[ENTRY:.*:]] +// X64-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8 +// X64-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8 +// X64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 +// X64-NEXT: [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16 +// X64-NEXT: store float [[TMP1]], ptr [[RETVAL]], align 16 +// X64-NEXT: [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16 +// X64-NEXT: ret x86_fp80 [[TMP2]] // -// CHECK32-LABEL: define dso_local x86_fp80 @test_volatile_dec( -// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { -// CHECK32-NEXT: entry: -// CHECK32-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 4 -// CHECK32-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 4 -// CHECK32-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4 -// CHECK32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 -// CHECK32-NEXT: [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4 -// CHECK32-NEXT: store float [[TMP1]], ptr [[RETVAL]], align 4 -// CHECK32-NEXT: [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4 -// CHECK32-NEXT: ret x86_fp80 [[TMP2]] +// X86-LABEL: define dso_local x86_fp80 @test_volatile_dec( +// X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 +// X86-NEXT: [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4 +// X86-NEXT: store float [[TMP1]], ptr [[RETVAL]], align 4 +// X86-NEXT: [[TMP2:%.*]] = 
load x86_fp80, ptr [[RETVAL]], align 4 +// X86-NEXT: ret x86_fp80 [[TMP2]] // long double test_volatile_dec(volatile _Atomic long double *addr) { return (*addr)--; } -// CHECK-LABEL: define dso_local x86_fp80 @test_volatile_compassign( -// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16 -// CHECK-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16 -// CHECK-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 16 -// CHECK-NEXT: [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 16 -// CHECK-NEXT: [[ATOMIC_TEMP5:%.*]] = alloca x86_fp80, align 16 -// CHECK-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 -// CHECK-NEXT: [[ATOMIC_LOAD:%.*]] = load atomic volatile i128, ptr [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP]], align 16 -// CHECK-NEXT: [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 16 -// CHECK-NEXT: br label [[ATOMIC_OP:%.*]] -// CHECK: atomic_op: -// CHECK-NEXT: [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TMP8:%.*]], [[ATOMIC_OP]] ] -// CHECK-NEXT: [[SUB:%.*]] = fsub x86_fp80 [[TMP2]], 0xK4003C800000000000000 -// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP1]], i8 0, i64 16, i1 false) -// CHECK-NEXT: store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 16 -// CHECK-NEXT: [[TMP3:%.*]] = load i128, ptr [[ATOMIC_TEMP1]], align 16 -// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP2]], i8 0, i64 16, i1 false) -// CHECK-NEXT: store x86_fp80 [[SUB]], ptr [[ATOMIC_TEMP2]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = load i128, ptr [[ATOMIC_TEMP2]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = cmpxchg volatile ptr [[TMP0]], i128 [[TMP3]], i128 [[TMP4]] seq_cst seq_cst, align 16 -// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i128, i1 } [[TMP5]], 0 -// 
CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i128, i1 } [[TMP5]], 1 -// CHECK-NEXT: store i128 [[TMP6]], ptr [[ATOMIC_TEMP3]], align 16 -// CHECK-NEXT: [[TMP8]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 16 -// CHECK-NEXT: br i1 [[TMP7]], label [[ATOMIC_CONT:%.*]], label [[ATOMIC_OP]] -// CHECK: atomic_cont: -// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 -// CHECK-NEXT: [[ATOMIC_LOAD4:%.*]] = load atomic volatile i128, ptr [[TMP9]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[ATOMIC_LOAD4]], ptr [[ATOMIC_TEMP5]], align 16 -// CHECK-NEXT: [[TMP10:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP5]], align 16 -// CHECK-NEXT: ret x86_fp80 [[TMP10]] +// X64-LABEL: define dso_local x86_fp80 @test_volatile_compassign( +// X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { +// X64-NEXT: [[ENTRY:.*]]: +// X64-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8 +// X64-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP5:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8 +// X64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 +// X64-NEXT: [[ATOMIC_LOAD:%.*]] = load atomic volatile i128, ptr [[TMP0]] seq_cst, align 16 +// X64-NEXT: store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP]], align 16 +// X64-NEXT: [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 16 +// X64-NEXT: br label %[[ATOMIC_OP:.*]] +// X64: [[ATOMIC_OP]]: +// X64-NEXT: [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], %[[ENTRY]] ], [ [[TMP8:%.*]], %[[ATOMIC_OP]] ] +// X64-NEXT: [[SUB:%.*]] = fsub x86_fp80 [[TMP2]], 0xK4003C800000000000000 +// X64-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP1]], i8 0, i64 16, i1 false) +// X64-NEXT: store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 16 +// X64-NEXT: [[TMP3:%.*]] = load 
i128, ptr [[ATOMIC_TEMP1]], align 16 +// X64-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP2]], i8 0, i64 16, i1 false) +// X64-NEXT: store x86_fp80 [[SUB]], ptr [[ATOMIC_TEMP2]], align 16 +// X64-NEXT: [[TMP4:%.*]] = load i128, ptr [[ATOMIC_TEMP2]], align 16 +// X64-NEXT: [[TMP5:%.*]] = cmpxchg volatile ptr [[TMP0]], i128 [[TMP3]], i128 [[TMP4]] seq_cst seq_cst, align 16 +// X64-NEXT: [[TMP6:%.*]] = extractvalue { i128, i1 } [[TMP5]], 0 +// X64-NEXT: [[TMP7:%.*]] = extractvalue { i128, i1 } [[TMP5]], 1 +// X64-NEXT: store i128 [[TMP6]], ptr [[ATOMIC_TEMP3]], align 16 +// X64-NEXT: [[TMP8]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 16 +// X64-NEXT: br i1 [[TMP7]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]] +// X64: [[ATOMIC_CONT]]: +// X64-NEXT: [[TMP9:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 +// X64-NEXT: [[ATOMIC_LOAD4:%.*]] = load atomic volatile i128, ptr [[TMP9]] seq_cst, align 16 +// X64-NEXT: store i128 [[ATOMIC_LOAD4]], ptr [[ATOMIC_TEMP5]], align 16 +// X64-NEXT: [[TMP10:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP5]], align 16 +// X64-NEXT: ret x86_fp80 [[TMP10]] // -// CHECK32-LABEL: define dso_local x86_fp80 @test_volatile_compassign( -// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { -// CHECK32-NEXT: entry: -// CHECK32-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 4 -// CHECK32-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4 -// CHECK32-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4 -// CHECK32-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 4 -// CHECK32-NEXT: [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 4 -// CHECK32-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4 -// CHECK32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 -// CHECK32-NEXT: call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5) -// CHECK32-NEXT: [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 4 -// CHECK32-NEXT: br label [[ATOMIC_OP:%.*]] -// CHECK32: atomic_op: 
-// CHECK32-NEXT: [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TMP3:%.*]], [[ATOMIC_OP]] ] -// CHECK32-NEXT: [[SUB:%.*]] = fsub x86_fp80 [[TMP2]], 0xK4003C800000000000000 -// CHECK32-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP1]], i8 0, i64 12, i1 false) -// CHECK32-NEXT: store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 4 -// CHECK32-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP2]], i8 0, i64 12, i1 false) -// CHECK32-NEXT: store x86_fp80 [[SUB]], ptr [[ATOMIC_TEMP2]], align 4 -// CHECK32-NEXT: [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP1]], ptr noundef [[ATOMIC_TEMP2]], i32 noundef 5, i32 noundef 5) -// CHECK32-NEXT: [[TMP3]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4 -// CHECK32-NEXT: br i1 [[CALL]], label [[ATOMIC_CONT:%.*]], label [[ATOMIC_OP]] -// CHECK32: atomic_cont: -// CHECK32-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 -// CHECK32-NEXT: call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP4]], ptr noundef [[ATOMIC_TEMP3]], i32 noundef 5) -// CHECK32-NEXT: [[TMP5:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 4 -// CHECK32-NEXT: ret x86_fp80 [[TMP5]] +// X86-LABEL: define dso_local x86_fp80 @test_volatile_compassign( +// X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { +// X86-NEXT: [[ENTRY:.*]]: +// X86-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 +// X86-NEXT: call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5) +// X86-NEXT: [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], 
align 4 +// X86-NEXT: br label %[[ATOMIC_OP:.*]] +// X86: [[ATOMIC_OP]]: +// X86-NEXT: [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], %[[ENTRY]] ], [ [[TMP3:%.*]], %[[ATOMIC_OP]] ] +// X86-NEXT: [[SUB:%.*]] = fsub x86_fp80 [[TMP2]], 0xK4003C800000000000000 +// X86-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP1]], i8 0, i64 12, i1 false) +// X86-NEXT: store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 4 +// X86-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP2]], i8 0, i64 12, i1 false) +// X86-NEXT: store x86_fp80 [[SUB]], ptr [[ATOMIC_TEMP2]], align 4 +// X86-NEXT: [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP1]], ptr noundef [[ATOMIC_TEMP2]], i32 noundef 5, i32 noundef 5) +// X86-NEXT: [[TMP3]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4 +// X86-NEXT: br i1 [[CALL]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]] +// X86: [[ATOMIC_CONT]]: +// X86-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 +// X86-NEXT: call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP4]], ptr noundef [[ATOMIC_TEMP3]], i32 noundef 5) +// X86-NEXT: [[TMP5:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 4 +// X86-NEXT: ret x86_fp80 [[TMP5]] // long double test_volatile_compassign(volatile _Atomic long double *addr) { *addr -= 25; return *addr; } -// CHECK-LABEL: define dso_local x86_fp80 @test_volatile_assign( -// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16 -// CHECK-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16 -// CHECK-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP]], i8 0, i64 16, i1 false) -// CHECK-NEXT: store x86_fp80 0xK4005E600000000000000, ptr [[ATOMIC_TEMP]], 
align 16 -// CHECK-NEXT: [[TMP1:%.*]] = load i128, ptr [[ATOMIC_TEMP]], align 16 -// CHECK-NEXT: store atomic volatile i128 [[TMP1]], ptr [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 -// CHECK-NEXT: [[ATOMIC_LOAD:%.*]] = load atomic volatile i128, ptr [[TMP2]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP1]], align 16 -// CHECK-NEXT: [[TMP3:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 16 -// CHECK-NEXT: ret x86_fp80 [[TMP3]] +// X64-LABEL: define dso_local x86_fp80 @test_volatile_assign( +// X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { +// X64-NEXT: [[ENTRY:.*:]] +// X64-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8 +// X64-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8 +// X64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 +// X64-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP]], i8 0, i64 16, i1 false) +// X64-NEXT: store x86_fp80 0xK4005E600000000000000, ptr [[ATOMIC_TEMP]], align 16 +// X64-NEXT: [[TMP1:%.*]] = load i128, ptr [[ATOMIC_TEMP]], align 16 +// X64-NEXT: store atomic volatile i128 [[TMP1]], ptr [[TMP0]] seq_cst, align 16 +// X64-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 +// X64-NEXT: [[ATOMIC_LOAD:%.*]] = load atomic volatile i128, ptr [[TMP2]] seq_cst, align 16 +// X64-NEXT: store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP1]], align 16 +// X64-NEXT: [[TMP3:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 16 +// X64-NEXT: ret x86_fp80 [[TMP3]] // -// CHECK32-LABEL: define dso_local x86_fp80 @test_volatile_assign( -// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { -// CHECK32-NEXT: entry: -// CHECK32-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 4 -// CHECK32-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4 -// CHECK32-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4 -// 
CHECK32-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4 -// CHECK32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 -// CHECK32-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP]], i8 0, i64 12, i1 false) -// CHECK32-NEXT: store x86_fp80 0xK4005E600000000000000, ptr [[ATOMIC_TEMP]], align 4 -// CHECK32-NEXT: call void @__atomic_store(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5) -// CHECK32-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 -// CHECK32-NEXT: call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP1]], ptr noundef [[ATOMIC_TEMP1]], i32 noundef 5) -// CHECK32-NEXT: [[TMP2:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4 -// CHECK32-NEXT: ret x86_fp80 [[TMP2]] +// X86-LABEL: define dso_local x86_fp80 @test_volatile_assign( +// X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 +// X86-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP]], i8 0, i64 12, i1 false) +// X86-NEXT: store x86_fp80 0xK4005E600000000000000, ptr [[ATOMIC_TEMP]], align 4 +// X86-NEXT: call void @__atomic_store(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5) +// X86-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 +// X86-NEXT: call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP1]], ptr noundef [[ATOMIC_TEMP1]], i32 noundef 5) +// X86-NEXT: [[TMP2:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4 +// X86-NEXT: ret x86_fp80 [[TMP2]] // long double test_volatile_assign(volatile _Atomic long double *addr) { *addr = 115; From 9c42ed1371ee8c211aedcfe8aed16662a9befb69 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 29 
May 2024 10:34:49 +0100 Subject: [PATCH 087/230] [X86] Add x86-atomic-double.c double test coverage --- clang/test/CodeGen/X86/x86-atomic-double.c | 104 +++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 clang/test/CodeGen/X86/x86-atomic-double.c diff --git a/clang/test/CodeGen/X86/x86-atomic-double.c b/clang/test/CodeGen/X86/x86-atomic-double.c new file mode 100644 index 00000000000000..2354c89cc2b170 --- /dev/null +++ b/clang/test/CodeGen/X86/x86-atomic-double.c @@ -0,0 +1,104 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// RUN: %clang_cc1 -triple x86_64-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck -check-prefixes=X64 %s +// RUN: %clang_cc1 -triple i686-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck -check-prefixes=X86 %s + + +// X64-LABEL: define dso_local double @test_double_post_inc( +// X64-SAME: ) #[[ATTR0:[0-9]+]] { +// X64-NEXT: entry: +// X64-NEXT: [[RETVAL:%.*]] = alloca double, align 8 +// X64-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr @test_double_post_inc.n, float 1.000000e+00 seq_cst, align 8 +// X64-NEXT: store float [[TMP0]], ptr [[RETVAL]], align 8 +// X64-NEXT: [[TMP1:%.*]] = load double, ptr [[RETVAL]], align 8 +// X64-NEXT: ret double [[TMP1]] +// +// X86-LABEL: define dso_local double @test_double_post_inc( +// X86-SAME: ) #[[ATTR0:[0-9]+]] { +// X86-NEXT: entry: +// X86-NEXT: [[RETVAL:%.*]] = alloca double, align 4 +// X86-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr @test_double_post_inc.n, float 1.000000e+00 seq_cst, align 8 +// X86-NEXT: store float [[TMP0]], ptr [[RETVAL]], align 4 +// X86-NEXT: [[TMP1:%.*]] = load double, ptr [[RETVAL]], align 4 +// X86-NEXT: ret double [[TMP1]] +// +double test_double_post_inc() +{ + static _Atomic double n; + return n++; +} + +// X64-LABEL: define dso_local double @test_double_post_dc( +// X64-SAME: ) #[[ATTR0]] { +// X64-NEXT: entry: +// X64-NEXT: [[RETVAL:%.*]] = alloca double, align 8 +// X64-NEXT: 
[[TMP0:%.*]] = atomicrmw fsub ptr @test_double_post_dc.n, float 1.000000e+00 seq_cst, align 8 +// X64-NEXT: store float [[TMP0]], ptr [[RETVAL]], align 8 +// X64-NEXT: [[TMP1:%.*]] = load double, ptr [[RETVAL]], align 8 +// X64-NEXT: ret double [[TMP1]] +// +// X86-LABEL: define dso_local double @test_double_post_dc( +// X86-SAME: ) #[[ATTR0]] { +// X86-NEXT: entry: +// X86-NEXT: [[RETVAL:%.*]] = alloca double, align 4 +// X86-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr @test_double_post_dc.n, float 1.000000e+00 seq_cst, align 8 +// X86-NEXT: store float [[TMP0]], ptr [[RETVAL]], align 4 +// X86-NEXT: [[TMP1:%.*]] = load double, ptr [[RETVAL]], align 4 +// X86-NEXT: ret double [[TMP1]] +// +double test_double_post_dc() +{ + static _Atomic double n; + return n--; +} + +// X64-LABEL: define dso_local double @test_double_pre_dc( +// X64-SAME: ) #[[ATTR0]] { +// X64-NEXT: entry: +// X64-NEXT: [[RETVAL:%.*]] = alloca double, align 8 +// X64-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr @test_double_pre_dc.n, float 1.000000e+00 seq_cst, align 8 +// X64-NEXT: [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00 +// X64-NEXT: store float [[TMP1]], ptr [[RETVAL]], align 8 +// X64-NEXT: [[TMP2:%.*]] = load double, ptr [[RETVAL]], align 8 +// X64-NEXT: ret double [[TMP2]] +// +// X86-LABEL: define dso_local double @test_double_pre_dc( +// X86-SAME: ) #[[ATTR0]] { +// X86-NEXT: entry: +// X86-NEXT: [[RETVAL:%.*]] = alloca double, align 4 +// X86-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr @test_double_pre_dc.n, float 1.000000e+00 seq_cst, align 8 +// X86-NEXT: [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00 +// X86-NEXT: store float [[TMP1]], ptr [[RETVAL]], align 4 +// X86-NEXT: [[TMP2:%.*]] = load double, ptr [[RETVAL]], align 4 +// X86-NEXT: ret double [[TMP2]] +// +double test_double_pre_dc() +{ + static _Atomic double n; + return --n; +} + +// X64-LABEL: define dso_local double @test_double_pre_inc( +// X64-SAME: ) #[[ATTR0]] { +// X64-NEXT: entry: +// X64-NEXT: [[RETVAL:%.*]] = alloca 
double, align 8 +// X64-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr @test_double_pre_inc.n, float 1.000000e+00 seq_cst, align 8 +// X64-NEXT: [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00 +// X64-NEXT: store float [[TMP1]], ptr [[RETVAL]], align 8 +// X64-NEXT: [[TMP2:%.*]] = load double, ptr [[RETVAL]], align 8 +// X64-NEXT: ret double [[TMP2]] +// +// X86-LABEL: define dso_local double @test_double_pre_inc( +// X86-SAME: ) #[[ATTR0]] { +// X86-NEXT: entry: +// X86-NEXT: [[RETVAL:%.*]] = alloca double, align 4 +// X86-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr @test_double_pre_inc.n, float 1.000000e+00 seq_cst, align 8 +// X86-NEXT: [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00 +// X86-NEXT: store float [[TMP1]], ptr [[RETVAL]], align 4 +// X86-NEXT: [[TMP2:%.*]] = load double, ptr [[RETVAL]], align 4 +// X86-NEXT: ret double [[TMP2]] +// +double test_double_pre_inc() +{ + static _Atomic double n; + return ++n; +} From f42de69213890f1203c1c3418a962e50de4ed73c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 29 May 2024 10:37:46 +0100 Subject: [PATCH 088/230] [X86] vector-shuffle-512-v16.ll - add fast shuffle test coverage --- .../CodeGen/X86/vector-shuffle-512-v16.ll | 181 ++++++++++++------ 1 file changed, 125 insertions(+), 56 deletions(-) diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll index c981d973fef3ed..bad0b411f68a95 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=ALL,AVX512F -; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw,+avx512dq | FileCheck %s --check-prefixes=ALL,AVX512BW +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=ALL,SLOW,AVX512F +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw,+avx512dq | 
FileCheck %s --check-prefixes=ALL,SLOW,AVX512BW +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,FAST,AVX512F +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,FAST,AVX512BW target triple = "x86_64-unknown-unknown" @@ -14,21 +16,33 @@ define <16 x float> @shuffle_v16f32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00 } define <16 x float> @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08(<16 x float> %a, <16 x float> %b) { -; ALL-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08: -; ALL: # %bb.0: -; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm0 -; ALL-NEXT: vbroadcastss %xmm0, %zmm0 -; ALL-NEXT: retq +; SLOW-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08: +; SLOW: # %bb.0: +; SLOW-NEXT: vextractf32x4 $2, %zmm0, %xmm0 +; SLOW-NEXT: vbroadcastss %xmm0, %zmm0 +; SLOW-NEXT: retq +; +; FAST-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08: +; FAST: # %bb.0: +; FAST-NEXT: vbroadcastss {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; FAST-NEXT: retq %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> ret <16 x float> %shuffle } define <16 x float> @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc(<16 x i32> %a, <16 x i32> %b) { -; ALL-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc: -; ALL: # %bb.0: -; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm0 -; ALL-NEXT: vbroadcastss %xmm0, %zmm0 -; ALL-NEXT: retq +; SLOW-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc: +; SLOW: # %bb.0: +; SLOW-NEXT: vextractf32x4 $2, %zmm0, %xmm0 +; SLOW-NEXT: vbroadcastss %xmm0, %zmm0 +; SLOW-NEXT: retq +; +; FAST-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc: +; 
FAST: # %bb.0: +; FAST-NEXT: vbroadcastss {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; FAST-NEXT: retq %tmp0 = bitcast <16 x i32> %a to <16 x float> %tmp1 = bitcast <16 x i32> %b to <16 x float> %shuffle = shufflevector <16 x float> %tmp0, <16 x float> %tmp1, <16 x i32> @@ -196,11 +210,20 @@ define <16 x float> @shuffle_f32_v16f32_00_08_01_09_02_10_03_11_04_12_05_13_06_1 ; PR86076 define <16 x float> @shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08(float %a0, float %a1) { -; ALL-LABEL: shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08: -; ALL: # %bb.0: -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; ALL-NEXT: vbroadcastsd %xmm0, %zmm0 -; ALL-NEXT: retq +; SLOW-LABEL: shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08: +; SLOW: # %bb.0: +; SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; SLOW-NEXT: vbroadcastsd %xmm0, %zmm0 +; SLOW-NEXT: retq +; +; FAST-LABEL: shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08: +; FAST: # %bb.0: +; FAST-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; FAST-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0,16,0,16,0,16,0,16] +; FAST-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 +; FAST-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; FAST-NEXT: retq %v0 = insertelement <8 x float> poison, float %a0, i64 0 %v1 = insertelement <8 x float> poison, float %a1, i64 0 %sv = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> @@ -217,11 +240,17 @@ define <16 x i32> @shuffle_v16i32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0 } define <16 x i32> @shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04(<16 x i32> %a, <16 x i32> %b) { -; ALL-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04: -; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vbroadcastss %xmm0, %zmm0 -; ALL-NEXT: retq +; SLOW-LABEL: 
shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04: +; SLOW: # %bb.0: +; SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; SLOW-NEXT: vbroadcastss %xmm0, %zmm0 +; SLOW-NEXT: retq +; +; FAST-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04: +; FAST: # %bb.0: +; FAST-NEXT: vbroadcastss {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; FAST-NEXT: retq %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> ret <16 x i32> %shuffle } @@ -302,21 +331,33 @@ define <16 x float> @shuffle_v16f32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08 ; PR46249 define <16 x i32> @shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04(<16 x i32> %a) { -; ALL-LABEL: shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04: -; ALL: # %bb.0: -; ALL-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] -; ALL-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3] -; ALL-NEXT: retq +; SLOW-LABEL: shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04: +; SLOW: # %bb.0: +; SLOW-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3] +; SLOW-NEXT: retq +; +; FAST-LABEL: shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04: +; FAST: # %bb.0: +; FAST-NEXT: vmovaps {{.*#+}} zmm1 = [11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4] +; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; FAST-NEXT: retq %1 = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> ret <16 x i32> %1 } define <16 x float> @shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04(<16 x float> %a) { -; ALL-LABEL: shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04: -; ALL: # %bb.0: -; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] -; ALL-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3] -; ALL-NEXT: retq +; SLOW-LABEL: 
shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04: +; SLOW: # %bb.0: +; SLOW-NEXT: vshufps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; SLOW-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3] +; SLOW-NEXT: retq +; +; FAST-LABEL: shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04: +; FAST: # %bb.0: +; FAST-NEXT: vmovaps {{.*#+}} zmm1 = [11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4] +; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; FAST-NEXT: retq %1 = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> ret <16 x float> %1 } @@ -333,11 +374,17 @@ define <16 x float> @shuffle_v16f32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_ } define <16 x float> @shuffle_v16f32_load_08_11_10_00_12_15_14_04(<16 x float> %a0, ptr %a1) { -; ALL-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04: -; ALL: # %bb.0: -; ALL-NEXT: vshufps {{.*#+}} zmm1 = zmm0[2,0],mem[0,0],zmm0[6,4],mem[4,4],zmm0[10,8],mem[8,8],zmm0[14,12],mem[12,12] -; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,3],zmm1[0,2],zmm0[4,7],zmm1[4,6],zmm0[8,11],zmm1[8,10],zmm0[12,15],zmm1[12,14] -; ALL-NEXT: retq +; SLOW-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04: +; SLOW: # %bb.0: +; SLOW-NEXT: vshufps {{.*#+}} zmm1 = zmm0[2,0],mem[0,0],zmm0[6,4],mem[4,4],zmm0[10,8],mem[8,8],zmm0[14,12],mem[12,12] +; SLOW-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,3],zmm1[0,2],zmm0[4,7],zmm1[4,6],zmm0[8,11],zmm1[8,10],zmm0[12,15],zmm1[12,14] +; SLOW-NEXT: retq +; +; FAST-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04: +; FAST: # %bb.0: +; FAST-NEXT: vmovaps {{.*#+}} zmm1 = [0,3,2,16,4,7,6,20,8,11,10,24,12,15,14,28] +; FAST-NEXT: vpermt2ps (%rdi), %zmm1, %zmm0 +; FAST-NEXT: retq %1 = load <16 x float>, ptr %a1 %2 = shufflevector <16 x float> %1, <16 x float> %a0, <16 x i32> ret <16 x float> %2 @@ -365,26 +412,41 @@ define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a ;FIXME: can do better with vpcompress define <8 x i32> @test_v16i32_1_3_5_7_9_11_13_15(<16 x i32> %v) { 
-; ALL-LABEL: test_v16i32_1_3_5_7_9_11_13_15: -; ALL: # %bb.0: -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; ALL-NEXT: retq +; SLOW-LABEL: test_v16i32_1_3_5_7_9_11_13_15: +; SLOW: # %bb.0: +; SLOW-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] +; SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; SLOW-NEXT: retq +; +; FAST-LABEL: test_v16i32_1_3_5_7_9_11_13_15: +; FAST: # %bb.0: +; FAST-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15] +; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; FAST-NEXT: retq %res = shufflevector <16 x i32> %v, <16 x i32> undef, <8 x i32> ret <8 x i32> %res } ;FIXME: can do better with vpcompress define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) { -; ALL-LABEL: test_v16i32_0_1_2_12: -; ALL: # %bb.0: -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1 -; ALL-NEXT: vbroadcastss %xmm1, %xmm1 -; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq +; SLOW-LABEL: test_v16i32_0_1_2_12: +; SLOW: # %bb.0: +; SLOW-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; SLOW-NEXT: vbroadcastss %xmm1, %xmm1 +; SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; SLOW-NEXT: vzeroupper +; SLOW-NEXT: retq +; +; FAST-LABEL: test_v16i32_0_1_2_12: +; FAST: # %bb.0: +; FAST-NEXT: vmovaps {{.*#+}} xmm1 = [0,1,2,12] +; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; FAST-NEXT: vzeroupper +; FAST-NEXT: retq %res = shufflevector <16 x i32> %v, <16 x i32> undef, <4 x i32> ret <4 x i32> %res } @@ -568,11 +630,18 @@ define <16 x i32> @shuffle_v8i32_17_16_01_00_21_20_05_04_25_24_09_08_29_28_13_12 } define <16 x float> 
@shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04(<8 x float> %a) { -; ALL-LABEL: shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04: -; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vbroadcastss %xmm0, %zmm0 -; ALL-NEXT: retq +; SLOW-LABEL: shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04: +; SLOW: # %bb.0: +; SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; SLOW-NEXT: vbroadcastss %xmm0, %zmm0 +; SLOW-NEXT: retq +; +; FAST-LABEL: shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04: +; FAST: # %bb.0: +; FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; FAST-NEXT: vbroadcastss {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <16 x i32> ret <16 x float> %shuffle } From 74014b5a3497c1e9c7f0652d26f78fffea9bf51c Mon Sep 17 00:00:00 2001 From: Lu Weining Date: Wed, 29 May 2024 17:39:38 +0800 Subject: [PATCH 089/230] Fix typo in AMDGPUUsage. NFC (#93652) The vendor name is mesa but not mesa3d. --- llvm/docs/AMDGPUUsage.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 1004956ac8f103..b827524e6b8db4 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -64,7 +64,7 @@ to specify the target triple: Vendor Description ============ ============================================================== ``amd`` Can be used for all AMD GPU usage. - ``mesa3d`` Can be used if the OS is ``mesa3d``. + ``mesa`` Can be used if the OS is ``mesa3d``. ============ ============================================================== .. 
table:: AMDGPU Operating Systems From dc8da7ddeaa595a34827fc9e39322a8109e177f0 Mon Sep 17 00:00:00 2001 From: Pankaj Dwivedi <167427157+PankajDwivedi-25@users.noreply.github.com> Date: Wed, 29 May 2024 15:10:44 +0530 Subject: [PATCH 090/230] [AMDGPU] Reserved private memory register during PEI (#93536) - Reserved newly selected private memory registers in entry Function Prologue generation. - Added assertion patch in eliminateFrameIndex to ensure register is reserved. Co-authored-by: PankajDwivedi-25 --- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 1 + llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 3 +++ 2 files changed, 4 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index eae666ab0e7d77..97a8ff44866095 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -579,6 +579,7 @@ Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg( (!GITPtrLoReg || !TRI->isSubRegisterEq(Reg, GITPtrLoReg))) { MRI.replaceRegWith(ScratchRsrcReg, Reg); MFI->setScratchRSrcReg(Reg); + MRI.reserveReg(Reg, TRI); return Reg; } } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index ddb5f719356855..4b5f9bdd82b8db 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -2083,6 +2083,9 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?"); + assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) && + "unreserved scratch RSRC register"); + MachineOperand &FIOp = MI->getOperand(FIOperandNum); int Index = MI->getOperand(FIOperandNum).getIndex(); From 1594cebedd60a08f408e3fa975116ef4db86bf9b Mon Sep 17 00:00:00 2001 From: Simon Camphausen Date: Wed, 29 May 2024 11:42:06 +0200 Subject: [PATCH 091/230] [mlir][EmitC] Fix evaluation order of expressions (#93549) Expressions with the 
same precedence were not parenthesized and therefore were possibly evaluated in the wrong order depending on the shape of the expression tree. --------- Co-authored-by: Matthias Gehre Co-authored-by: Corentin Ferry --- mlir/lib/Target/Cpp/TranslateToCpp.cpp | 6 +++++- mlir/test/Target/Cpp/expressions.mlir | 23 ++++++++++++++++++++--- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Target/Cpp/TranslateToCpp.cpp b/mlir/lib/Target/Cpp/TranslateToCpp.cpp index 7db7163bac4ab6..f19e0f8c4c2a42 100644 --- a/mlir/lib/Target/Cpp/TranslateToCpp.cpp +++ b/mlir/lib/Target/Cpp/TranslateToCpp.cpp @@ -1316,7 +1316,11 @@ LogicalResult CppEmitter::emitOperand(Value value) { FailureOr precedence = getOperatorPrecedence(def); if (failed(precedence)) return failure(); - bool encloseInParenthesis = precedence.value() < getExpressionPrecedence(); + + // Sub-expressions with equal or lower precedence need to be parenthesized, + // as they might be evaluated in the wrong order depending on the shape of + // the expression tree. 
+ bool encloseInParenthesis = precedence.value() <= getExpressionPrecedence(); if (encloseInParenthesis) { os << "("; pushExpressionPrecedence(lowestPrecedence()); diff --git a/mlir/test/Target/Cpp/expressions.mlir b/mlir/test/Target/Cpp/expressions.mlir index 2eda58902cb1d1..aaddd5af874a91 100644 --- a/mlir/test/Target/Cpp/expressions.mlir +++ b/mlir/test/Target/Cpp/expressions.mlir @@ -65,15 +65,15 @@ func.func @do_not_inline(%arg0: i32, %arg1: i32, %arg2 : i32) -> i32 { return %e : i32 } -// CPP-DEFAULT: float paranthesis_for_low_precedence(int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]], int32_t [[VAL_3:v[0-9]+]]) { +// CPP-DEFAULT: float parentheses_for_low_precedence(int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]], int32_t [[VAL_3:v[0-9]+]]) { // CPP-DEFAULT-NEXT: return (float) ([[VAL_1]] + [[VAL_2]] * [[VAL_3]]); // CPP-DEFAULT-NEXT: } -// CPP-DECLTOP: float paranthesis_for_low_precedence(int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]], int32_t [[VAL_3:v[0-9]+]]) { +// CPP-DECLTOP: float parentheses_for_low_precedence(int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]], int32_t [[VAL_3:v[0-9]+]]) { // CPP-DECLTOP-NEXT: return (float) ([[VAL_1]] + [[VAL_2]] * [[VAL_3]]); // CPP-DECLTOP-NEXT: } -func.func @paranthesis_for_low_precedence(%arg0: i32, %arg1: i32, %arg2: i32) -> f32 { +func.func @parentheses_for_low_precedence(%arg0: i32, %arg1: i32, %arg2: i32) -> f32 { %e = emitc.expression : f32 { %a = emitc.add %arg0, %arg1 : (i32, i32) -> i32 %b = emitc.mul %a, %arg2 : (i32, i32) -> i32 @@ -83,6 +83,23 @@ func.func @paranthesis_for_low_precedence(%arg0: i32, %arg1: i32, %arg2: i32) -> return %e : f32 } +// CPP-DEFAULT: int32_t parentheses_for_same_precedence(int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]], int32_t [[VAL_3:v[0-9]+]]) { +// CPP-DEFAULT-NEXT: return [[VAL_3]] / ([[VAL_1]] * [[VAL_2]]); +// CPP-DEFAULT-NEXT: } + +// CPP-DECLTOP: int32_t parentheses_for_same_precedence(int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]], 
int32_t [[VAL_3:v[0-9]+]]) { +// CPP-DECLTOP-NEXT: return [[VAL_3]] / ([[VAL_1]] * [[VAL_2]]); +// CPP-DECLTOP-NEXT: } +func.func @parentheses_for_same_precedence(%arg0: i32, %arg1: i32, %arg2: i32) -> i32 { + %e = emitc.expression : i32 { + %0 = emitc.mul %arg0, %arg1 : (i32, i32) -> i32 + %1 = emitc.div %arg2, %0 : (i32, i32) -> i32 + emitc.yield %1 : i32 + } + + return %e : i32 +} + // CPP-DEFAULT: int32_t multiple_uses(int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]], int32_t [[VAL_3:v[0-9]+]], int32_t [[VAL_4:v[0-9]+]]) { // CPP-DEFAULT-NEXT: bool [[VAL_5:v[0-9]+]] = bar([[VAL_1]] * [[VAL_2]], [[VAL_3]]) - [[VAL_4]] < [[VAL_2]]; // CPP-DEFAULT-NEXT: int32_t [[VAL_6:v[0-9]+]]; From 5553f27d5a45e702415fa2f91d842bf4a1f4a8b5 Mon Sep 17 00:00:00 2001 From: Lu Weining Date: Wed, 29 May 2024 17:42:41 +0800 Subject: [PATCH 092/230] [AMDGPU][test] Fix the wrong triples in lower-work-group-id-intrinsics-{hsa,pal}.ll. NFC (#93501) - hsa -> amdhsa - Use amdgcn-amd-amd{hsa,pal} for lower-work-group-id-intrinsics-{hsa,pal}.ll respectively --- .../lower-work-group-id-intrinsics-hsa.ll | 170 +++++++----------- .../lower-work-group-id-intrinsics-pal.ll | 65 +++---- 2 files changed, 88 insertions(+), 147 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll index 9547f08d3eba6b..1429251fc64211 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll @@ -1,17 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s -; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s -; RUN: llc 
-mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-SDAG %s -; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-GISEL %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s define amdgpu_kernel void @workgroup_ids_kernel() { ; GFX9-LABEL: workgroup_ids_kernel: ; GFX9: ; %bb.0: ; %.entry -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ 
-72,27 +72,20 @@ define amdgpu_kernel void @workgroup_ids_kernel() { define amdgpu_kernel void @caller() { ; GFX9-SDAG-LABEL: caller: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-SDAG-NEXT: s_mov_b32 s38, -1 -; GFX9-SDAG-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-SDAG-NEXT: s_add_u32 s36, s36, s7 -; GFX9-SDAG-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-SDAG-NEXT: s_add_u32 s8, s2, 36 -; GFX9-SDAG-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-SDAG-NEXT: s_getpc_b64 s[2:3] -; GFX9-SDAG-NEXT: s_add_u32 s2, s2, callee@gotpcrel32@lo+4 -; GFX9-SDAG-NEXT: s_addc_u32 s3, s3, callee@gotpcrel32@hi+12 -; GFX9-SDAG-NEXT: s_load_dwordx2 s[14:15], s[2:3], 0x0 -; GFX9-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-SDAG-NEXT: s_add_u32 flat_scratch_lo, s10, s13 +; GFX9-SDAG-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s13 +; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-SDAG-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9-SDAG-NEXT: s_getpc_b64 s[8:9] +; GFX9-SDAG-NEXT: s_add_u32 s8, s8, callee@gotpcrel32@lo+4 +; GFX9-SDAG-NEXT: s_addc_u32 s9, s9, callee@gotpcrel32@hi+12 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[14:15], s[8:9], 0x0 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-SDAG-NEXT: s_mov_b32 s12, s6 -; GFX9-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-SDAG-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s12 ; GFX9-SDAG-NEXT: s_mov_b32 s32, 0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: s_swappc_b64 s[30:31], s[14:15] @@ -100,27 +93,20 @@ define amdgpu_kernel void @caller() { ; ; GFX9-GISEL-LABEL: caller: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-GISEL-NEXT: s_mov_b32 s37, 
SCRATCH_RSRC_DWORD1 -; GFX9-GISEL-NEXT: s_mov_b32 s38, -1 -; GFX9-GISEL-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-GISEL-NEXT: s_add_u32 s36, s36, s7 -; GFX9-GISEL-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-GISEL-NEXT: s_add_u32 s8, s2, 36 -; GFX9-GISEL-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-GISEL-NEXT: s_getpc_b64 s[0:1] -; GFX9-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 -; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x0 +; GFX9-GISEL-NEXT: s_add_u32 flat_scratch_lo, s10, s13 +; GFX9-GISEL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-GISEL-NEXT: s_add_u32 s0, s0, s13 +; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9-GISEL-NEXT: s_getpc_b64 s[8:9] +; GFX9-GISEL-NEXT: s_add_u32 s8, s8, callee@gotpcrel32@lo+4 +; GFX9-GISEL-NEXT: s_addc_u32 s9, s9, callee@gotpcrel32@hi+12 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[14:15], s[8:9], 0x0 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX9-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-GISEL-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-GISEL-NEXT: s_mov_b32 s12, s6 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX9-GISEL-NEXT: s_mov_b32 s32, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_swappc_b64 s[30:31], s[14:15] @@ -128,81 +114,61 @@ define amdgpu_kernel void @caller() { ; ; GFX9ARCH-SDAG-LABEL: caller: ; GFX9ARCH-SDAG: ; %bb.0: -; GFX9ARCH-SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9ARCH-SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9ARCH-SDAG-NEXT: s_mov_b32 s38, -1 -; GFX9ARCH-SDAG-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9ARCH-SDAG-NEXT: s_add_u32 s36, s36, s6 -; GFX9ARCH-SDAG-NEXT: s_addc_u32 s37, s37, 0 -; 
GFX9ARCH-SDAG-NEXT: s_add_u32 s8, s2, 36 -; GFX9ARCH-SDAG-NEXT: s_addc_u32 s9, s3, 0 -; GFX9ARCH-SDAG-NEXT: s_getpc_b64 s[2:3] -; GFX9ARCH-SDAG-NEXT: s_add_u32 s2, s2, callee@gotpcrel32@lo+4 -; GFX9ARCH-SDAG-NEXT: s_addc_u32 s3, s3, callee@gotpcrel32@hi+12 -; GFX9ARCH-SDAG-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 -; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9ARCH-SDAG-NEXT: s_add_u32 flat_scratch_lo, s10, s12 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9ARCH-SDAG-NEXT: s_add_u32 s0, s0, s12 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9ARCH-SDAG-NEXT: s_getpc_b64 s[8:9] +; GFX9ARCH-SDAG-NEXT: s_add_u32 s8, s8, callee@gotpcrel32@lo+4 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s9, s9, callee@gotpcrel32@hi+12 +; GFX9ARCH-SDAG-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0 ; GFX9ARCH-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9ARCH-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9ARCH-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 ; GFX9ARCH-SDAG-NEXT: s_mov_b32 s32, 0 ; GFX9ARCH-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9ARCH-SDAG-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9ARCH-SDAG-NEXT: s_swappc_b64 s[30:31], s[12:13] ; GFX9ARCH-SDAG-NEXT: s_endpgm ; ; GFX9ARCH-GISEL-LABEL: caller: ; GFX9ARCH-GISEL: ; %bb.0: -; GFX9ARCH-GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9ARCH-GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9ARCH-GISEL-NEXT: s_mov_b32 s38, -1 -; GFX9ARCH-GISEL-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9ARCH-GISEL-NEXT: s_add_u32 s36, s36, s6 -; GFX9ARCH-GISEL-NEXT: s_addc_u32 s37, s37, 0 -; GFX9ARCH-GISEL-NEXT: s_add_u32 s8, s2, 36 -; GFX9ARCH-GISEL-NEXT: s_addc_u32 s9, s3, 0 -; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX9ARCH-GISEL-NEXT: 
s_mov_b64 s[4:5], s[0:1] -; GFX9ARCH-GISEL-NEXT: s_getpc_b64 s[0:1] -; GFX9ARCH-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 -; GFX9ARCH-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 -; GFX9ARCH-GISEL-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9ARCH-GISEL-NEXT: s_add_u32 flat_scratch_lo, s10, s12 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9ARCH-GISEL-NEXT: s_add_u32 s0, s0, s12 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9ARCH-GISEL-NEXT: s_getpc_b64 s[8:9] +; GFX9ARCH-GISEL-NEXT: s_add_u32 s8, s8, callee@gotpcrel32@lo+4 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s9, s9, callee@gotpcrel32@hi+12 +; GFX9ARCH-GISEL-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0 ; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9ARCH-GISEL-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX9ARCH-GISEL-NEXT: s_mov_b32 s32, 0 ; GFX9ARCH-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9ARCH-GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9ARCH-GISEL-NEXT: s_swappc_b64 s[30:31], s[12:13] ; GFX9ARCH-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: caller: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9 -; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX12-SDAG-NEXT: s_mov_b32 s7, callee@abs32@hi -; GFX12-SDAG-NEXT: s_mov_b32 s6, callee@abs32@lo -; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX12-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] -; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 -; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: caller: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9 -; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] -; 
GFX12-GISEL-NEXT: s_mov_b32 s6, callee@abs32@lo -; GFX12-GISEL-NEXT: s_mov_b32 s7, callee@abs32@hi -; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX12-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] -; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 -; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX12-GISEL-NEXT: s_endpgm +; GFX12-LABEL: caller: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX12-NEXT: s_getpc_b64 s[4:5] +; GFX12-NEXT: s_sext_i32_i16 s5, s5 +; GFX12-NEXT: s_add_co_u32 s4, s4, callee@gotpcrel32@lo+8 +; GFX12-NEXT: s_add_co_ci_u32 s5, s5, callee@gotpcrel32@hi+16 +; GFX12-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX12-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX12-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX12-NEXT: s_mov_b32 s32, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX12-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workgroup.id.x() call void @callee(i32 %idx) #0 ret void diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll index 14fe4e5f48c67c..8009f917aef5a7 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH-SDAG %s -; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < 
%s | FileCheck -check-prefixes=GFX9ARCH-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH-GISEL %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s @@ -67,62 +67,37 @@ define amdgpu_cs void @_amdgpu_cs_main() { } define amdgpu_cs void @caller() { -; GFX9-LABEL: caller: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: s_getpc_b64 s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[8:9] -; GFX9-NEXT: s_mov_b64 s[2:3], s[10:11] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm -; ; GFX9ARCH-SDAG-LABEL: caller: ; GFX9ARCH-SDAG: ; %bb.0: -; GFX9ARCH-SDAG-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9ARCH-SDAG-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9ARCH-SDAG-NEXT: s_mov_b32 s10, -1 -; GFX9ARCH-SDAG-NEXT: 
s_mov_b32 s11, 0xe00000 +; GFX9ARCH-SDAG-NEXT: s_getpc_b64 s[8:9] +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s8, s0 +; GFX9ARCH-SDAG-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s5, callee@abs32@hi +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s4, callee@abs32@lo +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX9ARCH-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9ARCH-SDAG-NEXT: s_add_u32 s8, s8, s0 ; GFX9ARCH-SDAG-NEXT: s_addc_u32 s9, s9, 0 -; GFX9ARCH-SDAG-NEXT: s_getpc_b64 s[0:1] -; GFX9ARCH-SDAG-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 -; GFX9ARCH-SDAG-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 -; GFX9ARCH-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[0:1], s[8:9] ; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] -; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9ARCH-SDAG-NEXT: s_mov_b32 s32, 0 -; GFX9ARCH-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9ARCH-SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9ARCH-SDAG-NEXT: s_endpgm ; ; GFX9ARCH-GISEL-LABEL: caller: ; GFX9ARCH-GISEL: ; %bb.0: -; GFX9ARCH-GISEL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9ARCH-GISEL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9ARCH-GISEL-NEXT: s_mov_b32 s10, -1 -; GFX9ARCH-GISEL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9ARCH-GISEL-NEXT: s_getpc_b64 s[8:9] +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s8, s0 +; GFX9ARCH-GISEL-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s4, callee@abs32@lo +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s5, callee@abs32@hi +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX9ARCH-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9ARCH-GISEL-NEXT: s_add_u32 s8, s8, s0 ; GFX9ARCH-GISEL-NEXT: s_addc_u32 s9, s9, 0 -; GFX9ARCH-GISEL-NEXT: s_getpc_b64 s[0:1] -; GFX9ARCH-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 -; GFX9ARCH-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 -; GFX9ARCH-GISEL-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x0 ; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[0:1], s[8:9] -; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 ; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] -; GFX9ARCH-GISEL-NEXT: s_mov_b32 s32, 0 -; GFX9ARCH-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9ARCH-GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9ARCH-GISEL-NEXT: s_endpgm ; From 78cc9cbba23fd1783a9b233ae745f126ece56cc7 Mon Sep 17 00:00:00 2001 From: Lukacma Date: Wed, 29 May 2024 10:44:58 +0100 Subject: [PATCH 093/230] [AArch64][SME] Add intrinsics for multi-vector BFCLAMP (#93532) According to the specification in https://github.com/ARM-software/acle/pull/309 this adds the intrinsics ``` svbfloat16x2_t svclamp[_single_bf16_x2](svbfloat16x2_t zd, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming; svbfloat16x4_t svclamp[_single_bf16_x4](svbfloat16x4_t zd, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming; ``` These are available only if __ARM_FEATURE_SME_B16B16 is enabled. --- clang/include/clang/Basic/arm_sve.td | 5 ++ .../aarch64-sme2-intrinsics/acle_sme2_clamp.c | 74 +++++++++++++++++-- .../acle_sme2_b16b16.cpp | 13 ++++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 2 + .../Target/AArch64/AArch64ISelDAGToDAG.cpp | 6 ++ .../AArch64/sve2p1-intrinsics-bfclamp.ll | 26 ++++++- 6 files changed, 120 insertions(+), 6 deletions(-) create mode 100644 clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_b16b16.cpp diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index 03570f94de6666..078ef576342a7c 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -2151,6 +2151,11 @@ let TargetGuard = "sme2" in { def SVFCLAMP_X4 : SInst<"svclamp[_single_{d}_x4]", "44dd", "hfd", MergeNone, "aarch64_sve_fclamp_single_x4", [IsStreaming], []>; } +let TargetGuard = "sme2,b16b16"in { + def SVBFCLAMP_X2 : SInst<"svclamp[_single_{d}_x2]", "22dd", "b", MergeNone, "aarch64_sve_bfclamp_single_x2", [IsStreaming], []>; + def SVBFCLAMP_X4 : 
SInst<"svclamp[_single_{d}_x4]", "44dd", "b", MergeNone, "aarch64_sve_bfclamp_single_x4", [IsStreaming], []>; +} + let TargetGuard = "sme2" in { // == ADD (vectors) == def SVADD_SINGLE_X2 : SInst<"svadd[_single_{d}_x2]", "22d", "cUcsUsiUilUl", MergeNone, "aarch64_sve_add_single_x2", [IsStreaming], []>; diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c index 57ea4d2a1ac47a..21a8229bbf244e 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c @@ -1,14 +1,14 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p 
mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 \ // RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include @@ -745,3 +745,67 @@ svfloat32x4_t test_svclamp_single_f32_x4(svfloat32x4_t op1, svfloat32_t op2, svf svfloat64x4_t test_svclamp_single_f64_x4(svfloat64x4_t op1, svfloat64_t op2, svfloat64_t op3) __arm_streaming { return SVE_ACLE_FUNC(svclamp, _single_f64_x4, , )(op1, op2, op3); } + +// CHECK-LABEL: @test_svclamp_single_bf16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[OP1]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.bfclamp.single.x2.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z27test_svclamp_single_bf16_x214svbfloat16x2_tu14__SVBfloat16_tS0_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[OP1]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.bfclamp.single.x2.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( 
poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svbfloat16x2_t test_svclamp_single_bf16_x2(svbfloat16x2_t op1, svbfloat16_t op2, svbfloat16_t op3) __arm_streaming { + return SVE_ACLE_FUNC(svclamp, _single_bf16_x2, , )(op1, op2, op3); +} + +// CHECK-LABEL: @test_svclamp_single_bf16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[OP1]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[OP1]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[OP1]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.bfclamp.single.x4.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z27test_svclamp_single_bf16_x414svbfloat16x4_tu14__SVBfloat16_tS0_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv8bf16.nxv32bf16( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[OP1]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[OP1]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[OP1]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.bfclamp.single.x4.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svbfloat16x4_t test_svclamp_single_bf16_x4(svbfloat16x4_t op1, svbfloat16_t op2, svbfloat16_t op3) __arm_streaming { + return SVE_ACLE_FUNC(svclamp, _single_bf16_x4, , )(op1, op2, op3); +} diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_b16b16.cpp b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_b16b16.cpp new file mode 100644 index 00000000000000..62a1f8e6de1d79 --- /dev/null +++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_b16b16.cpp @@ -0,0 +1,13 @@ +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -emit-llvm-only -verify -verify-ignore-unexpected=error,note -o - %s + +#include + +void 
test_b16b16( svbfloat16_t bf16, svbfloat16x2_t bf16x2, svbfloat16x4_t bf16x4) __arm_streaming +{ + // expected-error@+1 {{'svclamp_single_bf16_x2' needs target feature sme2,b16b16}} + svclamp_single_bf16_x2(bf16x2, bf16, bf16); + // expected-error@+1 {{'svclamp_single_bf16_x4' needs target feature sme2,b16b16}} + svclamp_single_bf16_x4(bf16x4, bf16, bf16); +} \ No newline at end of file diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 4544cf35fb7b37..57d0dfb698b383 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3472,10 +3472,12 @@ let TargetPrefix = "aarch64" in { def int_aarch64_sve_sclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic; def int_aarch64_sve_uclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic; def int_aarch64_sve_fclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic; + def int_aarch64_sve_bfclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic; def int_aarch64_sve_sclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic; def int_aarch64_sve_uclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic; def int_aarch64_sve_fclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic; + def int_aarch64_sve_bfclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic; // // Multi-vector add/sub and accumulate into ZA diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 25f2e4d7c4de63..660675cf8f3895 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -5738,6 +5738,9 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { AArch64::FCLAMP_VG2_2Z2Z_D})) SelectClamp(Node, 2, Op); return; + case Intrinsic::aarch64_sve_bfclamp_single_x2: + SelectClamp(Node, 2, AArch64::BFCLAMP_VG2_2ZZZ_H); + return; case Intrinsic::aarch64_sve_sclamp_single_x4: if (auto Op = SelectOpcodeFromVT( Node->getValueType(0), @@ -5759,6 
+5762,9 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { AArch64::FCLAMP_VG4_4Z4Z_D})) SelectClamp(Node, 4, Op); return; + case Intrinsic::aarch64_sve_bfclamp_single_x4: + SelectClamp(Node, 4, AArch64::BFCLAMP_VG4_4ZZZ_H); + return; case Intrinsic::aarch64_sve_add_single_x2: if (auto Op = SelectOpcodeFromVT( Node->getValueType(0), diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll index 61b67755a35441..7934f831a7e62f 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+sme2 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s define @bfclamp( %a, %b, %c){ ; CHECK-LABEL: bfclamp: @@ -11,3 +11,27 @@ define @bfclamp( %a, @llvm.aarch64.sve.fclamp.nxv8bf16(, , ) + +define { , } @test_bfclamp_single_x2_f16( %a, %b, %c, %d){ +; CHECK-LABEL: test_bfclamp_single_x2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: bfclamp { z0.h, z1.h }, z2.h, z3.h +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.bfclamp.single.x2.nxv8bf16( %a, %b, %c, %d) + ret { , } %res +} + +define { , , , } @test_bfclamp_single_x4_f16( %a, %b, %c, %d, %e, %f){ +; CHECK-LABEL: test_bfclamp_single_x4_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: 
bfclamp { z0.h - z3.h }, z4.h, z5.h +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.bfclamp.single.x4.nxv8bf16( %a, %b, %c, %d, %e, %f) + ret { , , , } %res +} From e1aa8ad6faa1524f12338ca58d1eadfde6f29f34 Mon Sep 17 00:00:00 2001 From: Kareem Ergawy Date: Wed, 29 May 2024 11:58:59 +0200 Subject: [PATCH 094/230] [flang][OpenMP] Fix bug in emitting `dealloc` logic (#93641) Fixes a bug in emitting deallocation logic when delayed privatization is disabled. I introduced the bug when implementing delayed privatization for allocatables: when delayed privatization is disabled the deallocation ops are emitted for only one of the allocatable variables. --- .../lib/Lower/OpenMP/DataSharingProcessor.cpp | 2 +- .../OpenMP/allocatable-multiple-vars.f90 | 28 +++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 flang/test/Lower/OpenMP/allocatable-multiple-vars.f90 diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp index b722e19272ca11..557a9685024c5e 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp @@ -86,7 +86,7 @@ void DataSharingProcessor::insertDeallocs() { if (semantics::IsAllocatable(sym->GetUltimate())) { if (!useDelayedPrivatization) { converter.createHostAssociateVarCloneDealloc(*sym); - return; + continue; } lower::SymbolBox hsb = converter.lookupOneLevelUpSymbol(*sym); diff --git a/flang/test/Lower/OpenMP/allocatable-multiple-vars.f90 b/flang/test/Lower/OpenMP/allocatable-multiple-vars.f90 new file mode 100644 index 00000000000000..e6450a13e13a05 --- /dev/null +++ b/flang/test/Lower/OpenMP/allocatable-multiple-vars.f90 @@ -0,0 +1,28 @@ +! Test early privatization for multiple allocatable variables. + +! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization=false \ +! RUN: -o - %s 2>&1 | FileCheck %s + +!
RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization=false -o - %s 2>&1 |\ +! RUN: FileCheck %s + +subroutine delayed_privatization_allocatable + implicit none + integer, allocatable :: var1, var2 + +!$omp parallel private(var1, var2) + var1 = 10 + var2 = 20 +!$omp end parallel +end subroutine + +! Verify that private versions of each variable are both allocated and freed +! within the parallel region. + +! CHECK: omp.parallel { +! CHECK: fir.allocmem +! CHECK: fir.allocmem +! CHECK: fir.freemem +! CHECK: fir.freemem +! CHECK: omp.terminator +! CHECK-NEXT: } From 5c214eb0c628c874f2c9496e663be4067e64442a Mon Sep 17 00:00:00 2001 From: Andreas Jonson Date: Wed, 29 May 2024 12:05:05 +0200 Subject: [PATCH 095/230] [Inline] Clone return range attribute on the callsite into inlined call (#92666) --- clang/test/Headers/__clang_hip_math.hip | 6 +- llvm/lib/Transforms/Utils/InlineFunction.cpp | 13 +++- .../Inline/ret_attr_align_and_noundef.ll | 73 +++++++++++++++++++ 3 files changed, 88 insertions(+), 4 deletions(-) diff --git a/clang/test/Headers/__clang_hip_math.hip b/clang/test/Headers/__clang_hip_math.hip index 1271868a53b866..26da82843c5124 100644 --- a/clang/test/Headers/__clang_hip_math.hip +++ b/clang/test/Headers/__clang_hip_math.hip @@ -231,7 +231,7 @@ extern "C" __device__ uint64_t test___make_mantissa(const char *p) { // CHECK-LABEL: @test_abs( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call noundef i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true) +// CHECK-NEXT: [[TMP0:%.*]] = tail call noundef range(i32 0, -2147483648) i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true) // CHECK-NEXT: ret i32 [[TMP0]] // extern "C" __device__ int test_abs(int x) { @@ -240,7 +240,7 @@ extern "C" __device__ int test_abs(int x) { // CHECK-LABEL: @test_labs( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call noundef i64 @llvm.abs.i64(i64 [[X:%.*]], i1 true) +// CHECK-NEXT: [[TMP0:%.*]] = tail call noundef range(i64 0, -9223372036854775808) i64 
@llvm.abs.i64(i64 [[X:%.*]], i1 true) // CHECK-NEXT: ret i64 [[TMP0]] // extern "C" __device__ long test_labs(long x) { @@ -249,7 +249,7 @@ extern "C" __device__ long test_labs(long x) { // CHECK-LABEL: @test_llabs( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call noundef i64 @llvm.abs.i64(i64 [[X:%.*]], i1 true) +// CHECK-NEXT: [[TMP0:%.*]] = tail call noundef range(i64 0, -9223372036854775808) i64 @llvm.abs.i64(i64 [[X:%.*]], i1 true) // CHECK-NEXT: ret i64 [[TMP0]] // extern "C" __device__ long long test_llabs(long x) { diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 7b846f2d2d72d6..eb471b259c7d4e 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -30,11 +30,12 @@ #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" -#include "llvm/IR/AttributeMask.h" #include "llvm/IR/Argument.h" +#include "llvm/IR/AttributeMask.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constant.h" +#include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" @@ -1450,6 +1451,8 @@ static AttrBuilder IdentifyValidPoisonGeneratingAttributes(CallBase &CB) { Valid.addAttribute(Attribute::NonNull); if (CB.hasRetAttr(Attribute::Alignment)) Valid.addAlignmentAttr(CB.getRetAlign()); + if (std::optional Range = CB.getRange()) + Valid.addRangeAttr(*Range); return Valid; } @@ -1541,6 +1544,14 @@ static void AddReturnAttributes(CallBase &CB, ValueToValueMapTy &VMap) { if (ValidPG.getAlignment().valueOrOne() < AL.getRetAlignment().valueOrOne()) ValidPG.removeAttribute(Attribute::Alignment); if (ValidPG.hasAttributes()) { + Attribute CBRange = ValidPG.getAttribute(Attribute::Range); + if (CBRange.isValid()) { + Attribute NewRange = AL.getRetAttr(Attribute::Range); + if (NewRange.isValid()) { + 
ValidPG.addRangeAttr( + CBRange.getRange().intersectWith(NewRange.getRange())); + } + } // Three checks. // If the callsite has `noundef`, then a poison due to violating the // return attribute will create UB anyways so we can always propagate. diff --git a/llvm/test/Transforms/Inline/ret_attr_align_and_noundef.ll b/llvm/test/Transforms/Inline/ret_attr_align_and_noundef.ll index c038ffccf3e96d..f4cebf1fcb5da0 100644 --- a/llvm/test/Transforms/Inline/ret_attr_align_and_noundef.ll +++ b/llvm/test/Transforms/Inline/ret_attr_align_and_noundef.ll @@ -5,10 +5,12 @@ declare ptr @foo() declare void @use.ptr(ptr) willreturn nounwind +declare void @use.val(i8) willreturn nounwind declare void @bar() declare void @baz() declare ptr @llvm.ptrmask.p0.i64(ptr, i64) declare i1 @val() +declare i8 @val8() define ptr @callee0123() { ; CHECK-LABEL: define ptr @callee0123() { @@ -337,3 +339,74 @@ define ptr @caller12_todo() { %r = call nonnull ptr @callee12() ret ptr %r } + +define i8 @callee13() { +; CHECK-LABEL: define i8 @callee13() { +; CHECK-NEXT: [[R:%.*]] = call i8 @val8() +; CHECK-NEXT: ret i8 [[R]] +; + %r = call i8 @val8() + ret i8 %r +} + +define i8 @caller13_okay_use_after_poison_anyways() { +; CHECK-LABEL: define i8 @caller13_okay_use_after_poison_anyways() { +; CHECK-NEXT: [[R_I:%.*]] = call range(i8 0, 10) i8 @val8() +; CHECK-NEXT: call void @use.val(i8 [[R_I]]) +; CHECK-NEXT: ret i8 [[R_I]] +; + %r = call range(i8 0, 10) i8 @callee13() + call void @use.val(i8 %r) + ret i8 %r +} + +define i8 @callee14() { +; CHECK-LABEL: define i8 @callee14() { +; CHECK-NEXT: [[R:%.*]] = call noundef i8 @val8() +; CHECK-NEXT: ret i8 [[R]] +; + %r = call noundef i8 @val8() + ret i8 %r +} + +define i8 @caller14_fail_creates_ub() { +; CHECK-LABEL: define i8 @caller14_fail_creates_ub() { +; CHECK-NEXT: [[R_I:%.*]] = call noundef i8 @val8() +; CHECK-NEXT: call void @use.val(i8 [[R_I]]) +; CHECK-NEXT: ret i8 [[R_I]] +; + %r = call range(i8 0, 10) i8 @callee14() + call void @use.val(i8 %r) + 
ret i8 %r +} + +define i8 @caller14_okay_is_ub_anyways() { +; CHECK-LABEL: define i8 @caller14_okay_is_ub_anyways() { +; CHECK-NEXT: [[R_I:%.*]] = call noundef range(i8 0, 10) i8 @val8() +; CHECK-NEXT: call void @use.val(i8 [[R_I]]) +; CHECK-NEXT: ret i8 [[R_I]] +; + %r = call noundef range(i8 0, 10) i8 @callee14() + call void @use.val(i8 %r) + ret i8 %r +} + +define i8 @callee15() { +; CHECK-LABEL: define i8 @callee15() { +; CHECK-NEXT: [[R:%.*]] = call range(i8 5, 10) i8 @val8() +; CHECK-NEXT: ret i8 [[R]] +; + %r = call range(i8 5, 10) i8 @val8() + ret i8 %r +} + +define i8 @caller15_okay_intersect_ranges() { +; CHECK-LABEL: define i8 @caller15_okay_intersect_ranges() { +; CHECK-NEXT: [[R_I:%.*]] = call range(i8 5, 7) i8 @val8() +; CHECK-NEXT: call void @use.val(i8 [[R_I]]) +; CHECK-NEXT: ret i8 [[R_I]] +; + %r = call range(i8 0, 7) i8 @callee15() + call void @use.val(i8 %r) + ret i8 %r +} From 971f1aaad3ca3680bfbab76212f498ca15b280a2 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Wed, 29 May 2024 10:05:43 +0000 Subject: [PATCH 096/230] [lldb][Test][Windows] Fix flaky address range API tests The new tests added in #92014 have been flaky on Linaro's Windows on Arm bot. They appear to be hitting a deadlock trying to clean up the test process. This only happens in async mode and I don't see why this test case needs async mode, so the simple workaround is to stick to sync mode. 
--- lldb/test/API/python_api/address_range/TestAddressRange.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/lldb/test/API/python_api/address_range/TestAddressRange.py b/lldb/test/API/python_api/address_range/TestAddressRange.py index 8c27558af4752d..65221e3f1b0e91 100644 --- a/lldb/test/API/python_api/address_range/TestAddressRange.py +++ b/lldb/test/API/python_api/address_range/TestAddressRange.py @@ -15,8 +15,6 @@ def setUp(self): self.build() exe = self.getBuildArtifact("a.out") - self.dbg.SetAsync(True) - self.target = self.dbg.CreateTarget(exe) self.assertTrue(self.target, VALID_TARGET) self.launch_info = self.target.GetLaunchInfo() From 3bcccb6af685c3132a9ee578b9e11b2503c35a5c Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Wed, 29 May 2024 18:09:23 +0800 Subject: [PATCH 097/230] [Reassociate] Drop weight reduction to fix issue 91417 (#91469) See the following case: https://alive2.llvm.org/ce/z/A-fBki ``` define i3 @src(i3 %0) { %2 = mul i3 %0, %0 %3 = mul i3 %2, %0 %4 = mul i3 %3, %0 %5 = mul nsw i3 %4, %0 ret i3 %5 } define i3 @tgt(i3 %0) { %2 = mul i3 %0, %0 %5 = mul nsw i3 %2, %0 ret i3 %5 } ``` https://github.com/llvm/llvm-project/commit/d7aeefebd6b049f017711cd7c6ef5f217a17b673 introduced weight reduction during weights combination of the same operand. As the weight of `%0` changes from 5 to 3, the nsw flag in `%5` should be dropped. However, the nsw flag isn't cleared by `RewriteExprTree` since `%5 = mul nsw i3 %0, %4` is not included in the range of `[ExpressionChangedStart, ExpressionChangedEnd)`. 
``` Calculated Rank[] = 3 Combine negations for: %2 = mul i3 %0, %0 Calculated Rank[] = 4 Combine negations for: %3 = mul i3 %0, %2 Calculated Rank[] = 5 Combine negations for: %4 = mul i3 %0, %3 Calculated Rank[] = 6 Combine negations for: %5 = mul nsw i3 %0, %4 LINEARIZE: %5 = mul nsw i3 %0, %4 OPERAND: i3 %0 (1) ADD USES LEAF: i3 %0 (1) OPERAND: %4 = mul i3 %0, %3 (1) DIRECT ADD: %4 = mul i3 %0, %3 (1) OPERAND: i3 %0 (1) OPERAND: %3 = mul i3 %0, %2 (1) DIRECT ADD: %3 = mul i3 %0, %2 (1) OPERAND: i3 %0 (1) OPERAND: %2 = mul i3 %0, %0 (1) DIRECT ADD: %2 = mul i3 %0, %0 (1) OPERAND: i3 %0 (1) OPERAND: i3 %0 (1) RAIn: mul i3 [ %0, #3] [ %0, #3] [ %0, #3] RAOut: mul i3 [ %0, #3] [ %0, #3] [ %0, #3] RAOut after CSE reorder: mul i3 [ %0, #3] [ %0, #3] [ %0, #3] RA: %5 = mul nsw i3 %0, %4 TO: %5 = mul nsw i3 %4, %0 RA: %4 = mul i3 %0, %3 TO: %4 = mul i3 %0, %0 ``` The best way to fix this is to inform `RewriteExprTree` to clear flags of the whole expr tree when weight reduction happens. But I find that weight reduction based on Carmichael number never happens in practice. See the coverage result https://dtcxzyw.github.io/llvm-opt-benchmark/coverage/home/dtcxzyw/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp.html#L323 I think it would be better to drop `IncorporateWeight`. Fixes #91417 --- llvm/lib/Transforms/Scalar/Reassociate.cpp | 112 +----------- llvm/test/Transforms/Reassociate/repeats.ll | 187 +++++++++++++------- 2 files changed, 126 insertions(+), 173 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp index c903e47a93cafd..04c54ed69e93f1 100644 --- a/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -302,97 +302,6 @@ static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) { return Res; } -/// Returns k such that lambda(2^Bitwidth) = 2^k, where lambda is the Carmichael -/// function. 
This means that x^(2^k) === 1 mod 2^Bitwidth for -/// every odd x, i.e. x^(2^k) = 1 for every odd x in Bitwidth-bit arithmetic. -/// Note that 0 <= k < Bitwidth, and if Bitwidth > 3 then x^(2^k) = 0 for every -/// even x in Bitwidth-bit arithmetic. -static unsigned CarmichaelShift(unsigned Bitwidth) { - if (Bitwidth < 3) - return Bitwidth - 1; - return Bitwidth - 2; -} - -/// Add the extra weight 'RHS' to the existing weight 'LHS', -/// reducing the combined weight using any special properties of the operation. -/// The existing weight LHS represents the computation X op X op ... op X where -/// X occurs LHS times. The combined weight represents X op X op ... op X with -/// X occurring LHS + RHS times. If op is "Xor" for example then the combined -/// operation is equivalent to X if LHS + RHS is odd, or 0 if LHS + RHS is even; -/// the routine returns 1 in LHS in the first case, and 0 in LHS in the second. -static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) { - // If we were working with infinite precision arithmetic then the combined - // weight would be LHS + RHS. But we are using finite precision arithmetic, - // and the APInt sum LHS + RHS may not be correct if it wraps (it is correct - // for nilpotent operations and addition, but not for idempotent operations - // and multiplication), so it is important to correctly reduce the combined - // weight back into range if wrapping would be wrong. - - // If RHS is zero then the weight didn't change. - if (RHS.isMinValue()) - return; - // If LHS is zero then the combined weight is RHS. - if (LHS.isMinValue()) { - LHS = RHS; - return; - } - // From this point on we know that neither LHS nor RHS is zero. - - if (Instruction::isIdempotent(Opcode)) { - // Idempotent means X op X === X, so any non-zero weight is equivalent to a - // weight of 1. Keeping weights at zero or one also means that wrapping is - // not a problem. 
- assert(LHS == 1 && RHS == 1 && "Weights not reduced!"); - return; // Return a weight of 1. - } - if (Instruction::isNilpotent(Opcode)) { - // Nilpotent means X op X === 0, so reduce weights modulo 2. - assert(LHS == 1 && RHS == 1 && "Weights not reduced!"); - LHS = 0; // 1 + 1 === 0 modulo 2. - return; - } - if (Opcode == Instruction::Add || Opcode == Instruction::FAdd) { - // TODO: Reduce the weight by exploiting nsw/nuw? - LHS += RHS; - return; - } - - assert((Opcode == Instruction::Mul || Opcode == Instruction::FMul) && - "Unknown associative operation!"); - unsigned Bitwidth = LHS.getBitWidth(); - // If CM is the Carmichael number then a weight W satisfying W >= CM+Bitwidth - // can be replaced with W-CM. That's because x^W=x^(W-CM) for every Bitwidth - // bit number x, since either x is odd in which case x^CM = 1, or x is even in - // which case both x^W and x^(W - CM) are zero. By subtracting off multiples - // of CM like this weights can always be reduced to the range [0, CM+Bitwidth) - // which by a happy accident means that they can always be represented using - // Bitwidth bits. - // TODO: Reduce the weight by exploiting nsw/nuw? (Could do much better than - // the Carmichael number). - if (Bitwidth > 3) { - /// CM - The value of Carmichael's lambda function. - APInt CM = APInt::getOneBitSet(Bitwidth, CarmichaelShift(Bitwidth)); - // Any weight W >= Threshold can be replaced with W - CM. - APInt Threshold = CM + Bitwidth; - assert(LHS.ult(Threshold) && RHS.ult(Threshold) && "Weights not reduced!"); - // For Bitwidth 4 or more the following sum does not overflow. - LHS += RHS; - while (LHS.uge(Threshold)) - LHS -= CM; - } else { - // To avoid problems with overflow do everything the same as above but using - // a larger type. 
- unsigned CM = 1U << CarmichaelShift(Bitwidth); - unsigned Threshold = CM + Bitwidth; - assert(LHS.getZExtValue() < Threshold && RHS.getZExtValue() < Threshold && - "Weights not reduced!"); - unsigned Total = LHS.getZExtValue() + RHS.getZExtValue(); - while (Total >= Threshold) - Total -= CM; - LHS = Total; - } -} - using RepeatedValue = std::pair; /// Given an associative binary expression, return the leaf @@ -562,26 +471,7 @@ static bool LinearizeExprTree(Instruction *I, "In leaf map but not visited!"); // Update the number of paths to the leaf. - IncorporateWeight(It->second, Weight, Opcode); - -#if 0 // TODO: Re-enable once PR13021 is fixed. - // The leaf already has one use from inside the expression. As we want - // exactly one such use, drop this new use of the leaf. - assert(!Op->hasOneUse() && "Only one use, but we got here twice!"); - I->setOperand(OpIdx, UndefValue::get(I->getType())); - Changed = true; - - // If the leaf is a binary operation of the right kind and we now see - // that its multiple original uses were in fact all by nodes belonging - // to the expression, then no longer consider it to be a leaf and add - // its operands to the expression. - if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) { - LLVM_DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n"); - Worklist.push_back(std::make_pair(BO, It->second)); - Leaves.erase(It); - continue; - } -#endif + It->second += Weight; // If we still have uses that are not accounted for by the expression // then it is not safe to modify the value. 
diff --git a/llvm/test/Transforms/Reassociate/repeats.ll b/llvm/test/Transforms/Reassociate/repeats.ll index c18db19fa73e35..28177f1c0ba5ee 100644 --- a/llvm/test/Transforms/Reassociate/repeats.ll +++ b/llvm/test/Transforms/Reassociate/repeats.ll @@ -1,56 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 ; RUN: opt < %s -passes=reassociate -S | FileCheck %s ; Tests involving repeated operations on the same value. define i8 @nilpotent(i8 %x) { -; CHECK-LABEL: @nilpotent( +; CHECK-LABEL: define i8 @nilpotent( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: ret i8 0 +; %tmp = xor i8 %x, %x ret i8 %tmp -; CHECK: ret i8 0 } define i2 @idempotent(i2 %x) { -; CHECK-LABEL: @idempotent( +; CHECK-LABEL: define i2 @idempotent( +; CHECK-SAME: i2 [[X:%.*]]) { +; CHECK-NEXT: ret i2 -1 +; %tmp1 = and i2 %x, %x %tmp2 = and i2 %tmp1, %x %tmp3 = and i2 %tmp2, %x ret i2 %tmp3 -; CHECK: ret i2 %x } define i2 @add(i2 %x) { -; CHECK-LABEL: @add( +; CHECK-LABEL: define i2 @add( +; CHECK-SAME: i2 [[X:%.*]]) { +; CHECK-NEXT: ret i2 0 +; %tmp1 = add i2 %x, %x %tmp2 = add i2 %tmp1, %x %tmp3 = add i2 %tmp2, %x ret i2 %tmp3 -; CHECK: ret i2 0 } define i2 @cst_add() { -; CHECK-LABEL: @cst_add( +; CHECK-LABEL: define i2 @cst_add() { +; CHECK-NEXT: ret i2 -1 +; %tmp1 = add i2 1, 1 %tmp2 = add i2 %tmp1, 1 ret i2 %tmp2 -; CHECK: ret i2 -1 } define i8 @cst_mul() { -; CHECK-LABEL: @cst_mul( +; CHECK-LABEL: define i8 @cst_mul() { +; CHECK-NEXT: ret i8 -13 +; %tmp1 = mul i8 3, 3 %tmp2 = mul i8 %tmp1, 3 %tmp3 = mul i8 %tmp2, 3 %tmp4 = mul i8 %tmp3, 3 ret i8 %tmp4 -; CHECK: ret i8 -13 } define i3 @foo3x5(i3 %x) { ; Can be done with two multiplies. 
-; CHECK-LABEL: @foo3x5( -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: ret +; CHECK-LABEL: define i3 @foo3x5( +; CHECK-SAME: i3 [[X:%.*]]) { +; CHECK-NEXT: [[TMP3:%.*]] = mul i3 [[X]], [[X]] +; CHECK-NEXT: [[TMP4:%.*]] = mul i3 [[TMP3]], [[X]] +; CHECK-NEXT: [[TMP5:%.*]] = mul i3 [[TMP4]], [[TMP3]] +; CHECK-NEXT: ret i3 [[TMP5]] +; %tmp1 = mul i3 %x, %x %tmp2 = mul i3 %tmp1, %x %tmp3 = mul i3 %tmp2, %x @@ -58,12 +70,31 @@ define i3 @foo3x5(i3 %x) { ret i3 %tmp4 } +define i3 @foo3x5_nsw(i3 %x) { +; Can be done with two multiplies. +; CHECK-LABEL: define i3 @foo3x5_nsw( +; CHECK-SAME: i3 [[X:%.*]]) { +; CHECK-NEXT: [[TMP3:%.*]] = mul i3 [[X]], [[X]] +; CHECK-NEXT: [[TMP2:%.*]] = mul i3 [[TMP3]], [[X]] +; CHECK-NEXT: [[TMP4:%.*]] = mul i3 [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret i3 [[TMP4]] +; + %tmp1 = mul i3 %x, %x + %tmp2 = mul i3 %tmp1, %x + %tmp3 = mul i3 %tmp2, %x + %tmp4 = mul nsw i3 %tmp3, %x + ret i3 %tmp4 +} + define i3 @foo3x6(i3 %x) { ; Can be done with two multiplies. -; CHECK-LABEL: @foo3x6( -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: ret +; CHECK-LABEL: define i3 @foo3x6( +; CHECK-SAME: i3 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = mul i3 [[X]], [[X]] +; CHECK-NEXT: [[TMP3:%.*]] = mul i3 [[TMP1]], [[X]] +; CHECK-NEXT: [[TMP2:%.*]] = mul i3 [[TMP3]], [[TMP3]] +; CHECK-NEXT: ret i3 [[TMP2]] +; %tmp1 = mul i3 %x, %x %tmp2 = mul i3 %tmp1, %x %tmp3 = mul i3 %tmp2, %x @@ -74,10 +105,14 @@ define i3 @foo3x6(i3 %x) { define i3 @foo3x7(i3 %x) { ; Can be done with two multiplies. 
-; CHECK-LABEL: @foo3x7( -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: ret +; CHECK-LABEL: define i3 @foo3x7( +; CHECK-SAME: i3 [[X:%.*]]) { +; CHECK-NEXT: [[TMP5:%.*]] = mul i3 [[X]], [[X]] +; CHECK-NEXT: [[TMP7:%.*]] = mul i3 [[TMP5]], [[X]] +; CHECK-NEXT: [[TMP3:%.*]] = mul i3 [[TMP7]], [[X]] +; CHECK-NEXT: [[TMP6:%.*]] = mul i3 [[TMP3]], [[TMP7]] +; CHECK-NEXT: ret i3 [[TMP6]] +; %tmp1 = mul i3 %x, %x %tmp2 = mul i3 %tmp1, %x %tmp3 = mul i3 %tmp2, %x @@ -89,10 +124,13 @@ define i3 @foo3x7(i3 %x) { define i4 @foo4x8(i4 %x) { ; Can be done with two multiplies. -; CHECK-LABEL: @foo4x8( -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: ret +; CHECK-LABEL: define i4 @foo4x8( +; CHECK-SAME: i4 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = mul i4 [[X]], [[X]] +; CHECK-NEXT: [[TMP3:%.*]] = mul i4 [[TMP1]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = mul i4 [[TMP3]], [[TMP3]] +; CHECK-NEXT: ret i4 [[TMP4]] +; %tmp1 = mul i4 %x, %x %tmp2 = mul i4 %tmp1, %x %tmp3 = mul i4 %tmp2, %x @@ -105,11 +143,14 @@ define i4 @foo4x8(i4 %x) { define i4 @foo4x9(i4 %x) { ; Can be done with three multiplies. -; CHECK-LABEL: @foo4x9( -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: ret +; CHECK-LABEL: define i4 @foo4x9( +; CHECK-SAME: i4 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = mul i4 [[X]], [[X]] +; CHECK-NEXT: [[TMP2:%.*]] = mul i4 [[TMP1]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = mul i4 [[TMP2]], [[X]] +; CHECK-NEXT: [[TMP8:%.*]] = mul i4 [[TMP3]], [[TMP2]] +; CHECK-NEXT: ret i4 [[TMP8]] +; %tmp1 = mul i4 %x, %x %tmp2 = mul i4 %tmp1, %x %tmp3 = mul i4 %tmp2, %x @@ -123,11 +164,14 @@ define i4 @foo4x9(i4 %x) { define i4 @foo4x10(i4 %x) { ; Can be done with three multiplies. 
-; CHECK-LABEL: @foo4x10( -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: ret +; CHECK-LABEL: define i4 @foo4x10( +; CHECK-SAME: i4 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = mul i4 [[X]], [[X]] +; CHECK-NEXT: [[TMP4:%.*]] = mul i4 [[TMP1]], [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = mul i4 [[TMP4]], [[X]] +; CHECK-NEXT: [[TMP3:%.*]] = mul i4 [[TMP2]], [[TMP2]] +; CHECK-NEXT: ret i4 [[TMP3]] +; %tmp1 = mul i4 %x, %x %tmp2 = mul i4 %tmp1, %x %tmp3 = mul i4 %tmp2, %x @@ -142,12 +186,15 @@ define i4 @foo4x10(i4 %x) { define i4 @foo4x11(i4 %x) { ; Can be done with four multiplies. -; CHECK-LABEL: @foo4x11( -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: ret +; CHECK-LABEL: define i4 @foo4x11( +; CHECK-SAME: i4 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = mul i4 [[X]], [[X]] +; CHECK-NEXT: [[TMP4:%.*]] = mul i4 [[TMP1]], [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = mul i4 [[TMP4]], [[X]] +; CHECK-NEXT: [[TMP3:%.*]] = mul i4 [[TMP2]], [[X]] +; CHECK-NEXT: [[TMP10:%.*]] = mul i4 [[TMP3]], [[TMP2]] +; CHECK-NEXT: ret i4 [[TMP10]] +; %tmp1 = mul i4 %x, %x %tmp2 = mul i4 %tmp1, %x %tmp3 = mul i4 %tmp2, %x @@ -163,10 +210,14 @@ define i4 @foo4x11(i4 %x) { define i4 @foo4x12(i4 %x) { ; Can be done with two multiplies. -; CHECK-LABEL: @foo4x12( -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: ret +; CHECK-LABEL: define i4 @foo4x12( +; CHECK-SAME: i4 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = mul i4 [[X]], [[X]] +; CHECK-NEXT: [[TMP4:%.*]] = mul i4 [[TMP1]], [[X]] +; CHECK-NEXT: [[TMP3:%.*]] = mul i4 [[TMP4]], [[TMP4]] +; CHECK-NEXT: [[TMP2:%.*]] = mul i4 [[TMP3]], [[TMP3]] +; CHECK-NEXT: ret i4 [[TMP2]] +; %tmp1 = mul i4 %x, %x %tmp2 = mul i4 %tmp1, %x %tmp3 = mul i4 %tmp2, %x @@ -183,11 +234,15 @@ define i4 @foo4x12(i4 %x) { define i4 @foo4x13(i4 %x) { ; Can be done with three multiplies. 
-; CHECK-LABEL: @foo4x13( -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: ret +; CHECK-LABEL: define i4 @foo4x13( +; CHECK-SAME: i4 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = mul i4 [[X]], [[X]] +; CHECK-NEXT: [[TMP2:%.*]] = mul i4 [[TMP1]], [[X]] +; CHECK-NEXT: [[TMP3:%.*]] = mul i4 [[TMP2]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = mul i4 [[TMP3]], [[X]] +; CHECK-NEXT: [[TMP12:%.*]] = mul i4 [[TMP4]], [[TMP3]] +; CHECK-NEXT: ret i4 [[TMP12]] +; %tmp1 = mul i4 %x, %x %tmp2 = mul i4 %tmp1, %x %tmp3 = mul i4 %tmp2, %x @@ -205,11 +260,15 @@ define i4 @foo4x13(i4 %x) { define i4 @foo4x14(i4 %x) { ; Can be done with three multiplies. -; CHECK-LABEL: @foo4x14( -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: ret +; CHECK-LABEL: define i4 @foo4x14( +; CHECK-SAME: i4 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = mul i4 [[X]], [[X]] +; CHECK-NEXT: [[TMP4:%.*]] = mul i4 [[TMP1]], [[X]] +; CHECK-NEXT: [[TMP5:%.*]] = mul i4 [[TMP4]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = mul i4 [[TMP5]], [[X]] +; CHECK-NEXT: [[TMP7:%.*]] = mul i4 [[TMP6]], [[TMP6]] +; CHECK-NEXT: ret i4 [[TMP7]] +; %tmp1 = mul i4 %x, %x %tmp2 = mul i4 %tmp1, %x %tmp3 = mul i4 %tmp2, %x @@ -228,12 +287,16 @@ define i4 @foo4x14(i4 %x) { define i4 @foo4x15(i4 %x) { ; Can be done with four multiplies. 
-; CHECK-LABEL: @foo4x15( -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: ret +; CHECK-LABEL: define i4 @foo4x15( +; CHECK-SAME: i4 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = mul i4 [[X]], [[X]] +; CHECK-NEXT: [[TMP4:%.*]] = mul i4 [[TMP1]], [[X]] +; CHECK-NEXT: [[TMP3:%.*]] = mul i4 [[TMP4]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = mul i4 [[TMP3]], [[X]] +; CHECK-NEXT: [[TMP5:%.*]] = mul i4 [[TMP6]], [[X]] +; CHECK-NEXT: [[TMP14:%.*]] = mul i4 [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret i4 [[TMP14]] +; %tmp1 = mul i4 %x, %x %tmp2 = mul i4 %tmp1, %x %tmp3 = mul i4 %tmp2, %x From 718ba5a58452f013f40fab94f967064919bf13ff Mon Sep 17 00:00:00 2001 From: AtariDreams Date: Sun, 26 May 2024 17:28:29 -0400 Subject: [PATCH 098/230] Reapply [InstCombine] lshr (mul (X, 2^N + 1)), N -> add (X, lshr(X, N)) (#92907) Alive2 Proofs: https://alive2.llvm.org/ce/z/eSinJY https://alive2.llvm.org/ce/z/vyKvde https://alive2.llvm.org/ce/z/dRFsfV I mistakenly reverted this commit as part of a larger set of reverts. Reapplied without changes. 
--- .../InstCombine/InstCombineShifts.cpp | 50 +++- llvm/test/Transforms/InstCombine/ashr-lshr.ll | 259 ++++++++++++++++++ llvm/test/Transforms/InstCombine/lshr.ll | 19 +- 3 files changed, 318 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp index 0f1979fbe0c769..4f91993750fd27 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -1461,13 +1461,24 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) { const APInt *MulC; if (match(Op0, m_NUWMul(m_Value(X), m_APInt(MulC)))) { - // Look for a "splat" mul pattern - it replicates bits across each half of - // a value, so a right shift is just a mask of the low bits: - // lshr i[2N] (mul nuw X, (2^N)+1), N --> and iN X, (2^N)-1 - // TODO: Generalize to allow more than just half-width shifts? - if (BitWidth > 2 && ShAmtC * 2 == BitWidth && (*MulC - 1).isPowerOf2() && - MulC->logBase2() == ShAmtC) - return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, *MulC - 2)); + if (BitWidth > 2 && (*MulC - 1).isPowerOf2() && + MulC->logBase2() == ShAmtC) { + // Look for a "splat" mul pattern - it replicates bits across each half + // of a value, so a right shift is just a mask of the low bits: + // lshr i[2N] (mul nuw X, (2^N)+1), N --> and iN X, (2^N)-1 + if (ShAmtC * 2 == BitWidth) + return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, *MulC - 2)); + + // lshr (mul nuw (X, 2^N + 1)), N -> add nuw (X, lshr(X, N)) + if (Op0->hasOneUse()) { + auto *NewAdd = BinaryOperator::CreateNUWAdd( + X, Builder.CreateLShr(X, ConstantInt::get(Ty, ShAmtC), "", + I.isExact())); + NewAdd->setHasNoSignedWrap( + cast<OverflowingBinaryOperator>(Op0)->hasNoSignedWrap()); + return NewAdd; + } + } // The one-use check is not strictly necessary, but codegen may not be // able to invert the transform and perf may suffer with an extra mul @@ -1487,6 +1498,16 @@ Instruction
*InstCombinerImpl::visitLShr(BinaryOperator &I) { } } + // lshr (mul nsw (X, 2^N + 1)), N -> add nsw (X, lshr(X, N)) + if (match(Op0, m_OneUse(m_NSWMul(m_Value(X), m_APInt(MulC))))) { + if (BitWidth > 2 && (*MulC - 1).isPowerOf2() && + MulC->logBase2() == ShAmtC) { + return BinaryOperator::CreateNSWAdd( + X, Builder.CreateLShr(X, ConstantInt::get(Ty, ShAmtC), "", + I.isExact())); + } + } + // Try to narrow bswap. // In the case where the shift amount equals the bitwidth difference, the // shift is eliminated. @@ -1690,6 +1711,21 @@ Instruction *InstCombinerImpl::visitAShr(BinaryOperator &I) { if (match(Op0, m_OneUse(m_NSWSub(m_Value(X), m_Value(Y))))) return new SExtInst(Builder.CreateICmpSLT(X, Y), Ty); } + + const APInt *MulC; + if (match(Op0, m_OneUse(m_NSWMul(m_Value(X), m_APInt(MulC)))) && + (BitWidth > 2 && (*MulC - 1).isPowerOf2() && + MulC->logBase2() == ShAmt && + (ShAmt < BitWidth - 1))) /* Minus 1 for the sign bit */ { + + // ashr (mul nsw (X, 2^N + 1)), N -> add nsw (X, ashr(X, N)) + auto *NewAdd = BinaryOperator::CreateNSWAdd( + X, + Builder.CreateAShr(X, ConstantInt::get(Ty, ShAmt), "", I.isExact())); + NewAdd->setHasNoUnsignedWrap( + cast<OverflowingBinaryOperator>(Op0)->hasNoUnsignedWrap()); + return NewAdd; + } } const SimplifyQuery Q = SQ.getWithInstruction(&I); diff --git a/llvm/test/Transforms/InstCombine/ashr-lshr.ll b/llvm/test/Transforms/InstCombine/ashr-lshr.ll index ac206dc7999dd2..c2a4f35412670b 100644 --- a/llvm/test/Transforms/InstCombine/ashr-lshr.ll +++ b/llvm/test/Transforms/InstCombine/ashr-lshr.ll @@ -604,3 +604,262 @@ define <2 x i8> @ashr_known_pos_exact_vec(<2 x i8> %x, <2 x i8> %y) { %r = ashr exact <2 x i8> %p, %y ret <2 x i8> %r } + +define i32 @lshr_mul_times_3_div_2(i32 %0) { +; CHECK-LABEL: @lshr_mul_times_3_div_2( +; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP0:%.*]], 1 +; CHECK-NEXT: [[LSHR:%.*]] = add nuw nsw i32 [[TMP2]], [[TMP0]] +; CHECK-NEXT: ret i32 [[LSHR]] +; + %mul = mul nsw nuw i32 %0, 3 + %lshr = lshr i32 %mul, 1 + ret i32 %lshr +} +
+define i32 @lshr_mul_times_3_div_2_exact(i32 %x) { +; CHECK-LABEL: @lshr_mul_times_3_div_2_exact( +; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i32 [[X:%.*]], 1 +; CHECK-NEXT: [[LSHR:%.*]] = add nsw i32 [[TMP1]], [[X]] +; CHECK-NEXT: ret i32 [[LSHR]] +; + %mul = mul nsw i32 %x, 3 + %lshr = lshr exact i32 %mul, 1 + ret i32 %lshr +} + +; Negative test + +define i32 @lshr_mul_times_3_div_2_no_flags(i32 %0) { +; CHECK-LABEL: @lshr_mul_times_3_div_2_no_flags( +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[TMP0:%.*]], 3 +; CHECK-NEXT: [[LSHR:%.*]] = lshr i32 [[MUL]], 1 +; CHECK-NEXT: ret i32 [[LSHR]] +; + %mul = mul i32 %0, 3 + %lshr = lshr i32 %mul, 1 + ret i32 %lshr +} + +; Negative test + +define i32 @mul_times_3_div_2_multiuse_lshr(i32 %x) { +; CHECK-LABEL: @mul_times_3_div_2_multiuse_lshr( +; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[X:%.*]], 3 +; CHECK-NEXT: [[RES:%.*]] = lshr i32 [[MUL]], 1 +; CHECK-NEXT: call void @use(i32 [[MUL]]) +; CHECK-NEXT: ret i32 [[RES]] +; + %mul = mul nuw i32 %x, 3 + %res = lshr i32 %mul, 1 + call void @use(i32 %mul) + ret i32 %res +} + +define i32 @lshr_mul_times_3_div_2_exact_2(i32 %x) { +; CHECK-LABEL: @lshr_mul_times_3_div_2_exact_2( +; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i32 [[X:%.*]], 1 +; CHECK-NEXT: [[LSHR:%.*]] = add nuw i32 [[TMP1]], [[X]] +; CHECK-NEXT: ret i32 [[LSHR]] +; + %mul = mul nuw i32 %x, 3 + %lshr = lshr exact i32 %mul, 1 + ret i32 %lshr +} + +define i32 @lshr_mul_times_5_div_4(i32 %0) { +; CHECK-LABEL: @lshr_mul_times_5_div_4( +; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP0:%.*]], 2 +; CHECK-NEXT: [[LSHR:%.*]] = add nuw nsw i32 [[TMP2]], [[TMP0]] +; CHECK-NEXT: ret i32 [[LSHR]] +; + %mul = mul nsw nuw i32 %0, 5 + %lshr = lshr i32 %mul, 2 + ret i32 %lshr +} + +define i32 @lshr_mul_times_5_div_4_exact(i32 %x) { +; CHECK-LABEL: @lshr_mul_times_5_div_4_exact( +; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i32 [[X:%.*]], 2 +; CHECK-NEXT: [[LSHR:%.*]] = add nsw i32 [[TMP1]], [[X]] +; CHECK-NEXT: ret i32 [[LSHR]] +; + %mul = mul nsw i32 %x, 
5 + %lshr = lshr exact i32 %mul, 2 + ret i32 %lshr +} + +; Negative test + +define i32 @lshr_mul_times_5_div_4_no_flags(i32 %0) { +; CHECK-LABEL: @lshr_mul_times_5_div_4_no_flags( +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[TMP0:%.*]], 5 +; CHECK-NEXT: [[LSHR:%.*]] = lshr i32 [[MUL]], 2 +; CHECK-NEXT: ret i32 [[LSHR]] +; + %mul = mul i32 %0, 5 + %lshr = lshr i32 %mul, 2 + ret i32 %lshr +} + +; Negative test + +define i32 @mul_times_5_div_4_multiuse_lshr(i32 %x) { +; CHECK-LABEL: @mul_times_5_div_4_multiuse_lshr( +; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[X:%.*]], 5 +; CHECK-NEXT: [[RES:%.*]] = lshr i32 [[MUL]], 2 +; CHECK-NEXT: call void @use(i32 [[MUL]]) +; CHECK-NEXT: ret i32 [[RES]] +; + %mul = mul nuw i32 %x, 5 + %res = lshr i32 %mul, 2 + call void @use(i32 %mul) + ret i32 %res +} + +define i32 @lshr_mul_times_5_div_4_exact_2(i32 %x) { +; CHECK-LABEL: @lshr_mul_times_5_div_4_exact_2( +; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i32 [[X:%.*]], 2 +; CHECK-NEXT: [[LSHR:%.*]] = add nuw i32 [[TMP1]], [[X]] +; CHECK-NEXT: ret i32 [[LSHR]] +; + %mul = mul nuw i32 %x, 5 + %lshr = lshr exact i32 %mul, 2 + ret i32 %lshr +} + +define i32 @ashr_mul_times_3_div_2(i32 %0) { +; CHECK-LABEL: @ashr_mul_times_3_div_2( +; CHECK-NEXT: [[TMP2:%.*]] = ashr i32 [[TMP0:%.*]], 1 +; CHECK-NEXT: [[ASHR:%.*]] = add nuw nsw i32 [[TMP2]], [[TMP0]] +; CHECK-NEXT: ret i32 [[ASHR]] +; + %mul = mul nuw nsw i32 %0, 3 + %ashr = ashr i32 %mul, 1 + ret i32 %ashr +} + +define i32 @ashr_mul_times_3_div_2_exact(i32 %x) { +; CHECK-LABEL: @ashr_mul_times_3_div_2_exact( +; CHECK-NEXT: [[TMP1:%.*]] = ashr exact i32 [[X:%.*]], 1 +; CHECK-NEXT: [[ASHR:%.*]] = add nsw i32 [[TMP1]], [[X]] +; CHECK-NEXT: ret i32 [[ASHR]] +; + %mul = mul nsw i32 %x, 3 + %ashr = ashr exact i32 %mul, 1 + ret i32 %ashr +} + +; Negative test + +define i32 @ashr_mul_times_3_div_2_no_flags(i32 %0) { +; CHECK-LABEL: @ashr_mul_times_3_div_2_no_flags( +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[TMP0:%.*]], 3 +; CHECK-NEXT: [[ASHR:%.*]] = ashr i32 
[[MUL]], 1 +; CHECK-NEXT: ret i32 [[ASHR]] +; + %mul = mul i32 %0, 3 + %ashr = ashr i32 %mul, 1 + ret i32 %ashr +} + +; Negative test + +define i32 @ashr_mul_times_3_div_2_no_nsw(i32 %0) { +; CHECK-LABEL: @ashr_mul_times_3_div_2_no_nsw( +; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[TMP0:%.*]], 3 +; CHECK-NEXT: [[ASHR:%.*]] = ashr i32 [[MUL]], 1 +; CHECK-NEXT: ret i32 [[ASHR]] +; + %mul = mul nuw i32 %0, 3 + %ashr = ashr i32 %mul, 1 + ret i32 %ashr +} + +; Negative test + +define i32 @mul_times_3_div_2_multiuse_ashr(i32 %x) { +; CHECK-LABEL: @mul_times_3_div_2_multiuse_ashr( +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[X:%.*]], 3 +; CHECK-NEXT: [[RES:%.*]] = ashr i32 [[MUL]], 1 +; CHECK-NEXT: call void @use(i32 [[MUL]]) +; CHECK-NEXT: ret i32 [[RES]] +; + %mul = mul nsw i32 %x, 3 + %res = ashr i32 %mul, 1 + call void @use(i32 %mul) + ret i32 %res +} + +define i32 @ashr_mul_times_3_div_2_exact_2(i32 %x) { +; CHECK-LABEL: @ashr_mul_times_3_div_2_exact_2( +; CHECK-NEXT: [[TMP1:%.*]] = ashr exact i32 [[X:%.*]], 1 +; CHECK-NEXT: [[ASHR:%.*]] = add nsw i32 [[TMP1]], [[X]] +; CHECK-NEXT: ret i32 [[ASHR]] +; + %mul = mul nsw i32 %x, 3 + %ashr = ashr exact i32 %mul, 1 + ret i32 %ashr +} + +define i32 @ashr_mul_times_5_div_4(i32 %0) { +; CHECK-LABEL: @ashr_mul_times_5_div_4( +; CHECK-NEXT: [[TMP2:%.*]] = ashr i32 [[TMP0:%.*]], 2 +; CHECK-NEXT: [[ASHR:%.*]] = add nuw nsw i32 [[TMP2]], [[TMP0]] +; CHECK-NEXT: ret i32 [[ASHR]] +; + %mul = mul nuw nsw i32 %0, 5 + %ashr = ashr i32 %mul, 2 + ret i32 %ashr +} + +define i32 @ashr_mul_times_5_div_4_exact(i32 %x) { +; CHECK-LABEL: @ashr_mul_times_5_div_4_exact( +; CHECK-NEXT: [[TMP1:%.*]] = ashr exact i32 [[X:%.*]], 2 +; CHECK-NEXT: [[ASHR:%.*]] = add nsw i32 [[TMP1]], [[X]] +; CHECK-NEXT: ret i32 [[ASHR]] +; + %mul = mul nsw i32 %x, 5 + %ashr = ashr exact i32 %mul, 2 + ret i32 %ashr +} + +; Negative test + +define i32 @ashr_mul_times_5_div_4_no_flags(i32 %0) { +; CHECK-LABEL: @ashr_mul_times_5_div_4_no_flags( +; CHECK-NEXT: [[MUL:%.*]] = 
mul i32 [[TMP0:%.*]], 5 +; CHECK-NEXT: [[ASHR:%.*]] = ashr i32 [[MUL]], 2 +; CHECK-NEXT: ret i32 [[ASHR]] +; + %mul = mul i32 %0, 5 + %ashr = ashr i32 %mul, 2 + ret i32 %ashr +} + +; Negative test + +define i32 @mul_times_5_div_4_multiuse_ashr(i32 %x) { +; CHECK-LABEL: @mul_times_5_div_4_multiuse_ashr( +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[X:%.*]], 5 +; CHECK-NEXT: [[RES:%.*]] = ashr i32 [[MUL]], 2 +; CHECK-NEXT: call void @use(i32 [[MUL]]) +; CHECK-NEXT: ret i32 [[RES]] +; + %mul = mul nsw i32 %x, 5 + %res = ashr i32 %mul, 2 + call void @use(i32 %mul) + ret i32 %res +} + +define i32 @ashr_mul_times_5_div_4_exact_2(i32 %x) { +; CHECK-LABEL: @ashr_mul_times_5_div_4_exact_2( +; CHECK-NEXT: [[TMP1:%.*]] = ashr exact i32 [[X:%.*]], 2 +; CHECK-NEXT: [[ASHR:%.*]] = add nsw i32 [[TMP1]], [[X]] +; CHECK-NEXT: ret i32 [[ASHR]] +; + %mul = mul nsw i32 %x, 5 + %ashr = ashr exact i32 %mul, 2 + ret i32 %ashr +} + +declare void @use(i32) diff --git a/llvm/test/Transforms/InstCombine/lshr.ll b/llvm/test/Transforms/InstCombine/lshr.ll index fa92c1c4b3be4b..dfdb6c7b4b2689 100644 --- a/llvm/test/Transforms/InstCombine/lshr.ll +++ b/llvm/test/Transforms/InstCombine/lshr.ll @@ -628,12 +628,12 @@ define i32 @mul_splat_fold_wrong_lshr_const(i32 %x) { ret i32 %t } -; Negative test +; Negative test (but simplifies into a different transform) define i32 @mul_splat_fold_no_nuw(i32 %x) { ; CHECK-LABEL: @mul_splat_fold_no_nuw( -; CHECK-NEXT: [[M:%.*]] = mul nsw i32 [[X:%.*]], 65537 -; CHECK-NEXT: [[T:%.*]] = lshr i32 [[M]], 16 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X:%.*]], 16 +; CHECK-NEXT: [[T:%.*]] = add nsw i32 [[TMP1]], [[X]] ; CHECK-NEXT: ret i32 [[T]] ; %m = mul nsw i32 %x, 65537 @@ -641,6 +641,19 @@ define i32 @mul_splat_fold_no_nuw(i32 %x) { ret i32 %t } +; Negative test + +define i32 @mul_splat_fold_no_flags(i32 %x) { +; CHECK-LABEL: @mul_splat_fold_no_flags( +; CHECK-NEXT: [[M:%.*]] = mul i32 [[X:%.*]], 65537 +; CHECK-NEXT: [[T:%.*]] = lshr i32 [[M]], 16 +; CHECK-NEXT: ret 
i32 [[T]] +; + %m = mul i32 %x, 65537 + %t = lshr i32 %m, 16 + ret i32 %t +} + ; Negative test (but simplifies before we reach the mul_splat transform)- need more than 2 bits define i2 @mul_splat_fold_too_narrow(i2 %x) { From 6543453c3604c5532666a9bad2bf3d261099dab5 Mon Sep 17 00:00:00 2001 From: Tuan Chuong Goh Date: Wed, 29 May 2024 09:05:09 +0000 Subject: [PATCH 099/230] [AArch64][NFC] Pre-commit test update for Select TBL/TBX instructions (#92914) --- llvm/test/CodeGen/AArch64/arm64-tbl.ll | 28 +++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/AArch64/arm64-tbl.ll b/llvm/test/CodeGen/AArch64/arm64-tbl.ll index b89232c03f1363..96b2af7274b5bf 100644 --- a/llvm/test/CodeGen/AArch64/arm64-tbl.ll +++ b/llvm/test/CodeGen/AArch64/arm64-tbl.ll @@ -1,5 +1,28 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for tbl2_8b +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbl2_16b +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbl3_8b +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbl3_16b +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbl4_8b +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbl4_16b +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_v8i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for 
shuffled_tbl2_to_tbl4_nonconst_first_mask +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_nonconst_first_mask2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_nonconst_second_mask +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_nonconst_second_mask2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_mixed_shuffle +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_mixed_tbl2_mask1 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_mixed_tbl2_mask2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbx2_8b +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbx2_16b +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbx3_8b +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbx3_16b +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbx4_8b +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbx4_16b define <8 x i8> @tbl1_8b(<16 x i8> %A, <8 x i8> %B) nounwind { ; CHECK-LABEL: tbl1_8b: @@ -571,3 +594,6 @@ declare <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, declare <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone declare <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK-GI: {{.*}} +; CHECK-SD: {{.*}} From aef0bdd36d888edd1575713e4976162daf81ff5b Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 29 May 2024 12:26:27 +0200 Subject: [PATCH 100/230] DAG: Preserve flags when expanding fminimum/fmaximum (#93550) The operation selection logic here doesn't really work when vector types need to be split. This was also dropping the flags, and losing nnan made the combine from select back to fmin/fmax unrecoverable. Preserve the flags to assist a future commit. --- llvm/include/llvm/CodeGen/SelectionDAG.h | 4 ++-- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 15 ++++++++------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index 96a62706904686..0dc237301abb48 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1241,11 +1241,11 @@ class SelectionDAG { /// Helper function to make it easier to build Select's if you just have /// operands and don't want to check for vector. SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, - SDValue RHS) { + SDValue RHS, SDNodeFlags Flags = SDNodeFlags()) { assert(LHS.getValueType() == VT && RHS.getValueType() == VT && "Cannot use select on differing types"); auto Opcode = Cond.getValueType().isVector() ? 
ISD::VSELECT : ISD::SELECT; - return getNode(Opcode, DL, VT, Cond, LHS, RHS); + return getNode(Opcode, DL, VT, Cond, LHS, RHS, Flags); } /// Helper function to make it easier to build SelectCC's if you just have an diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 4e47f50ee42894..623b6343994a41 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -8428,6 +8428,7 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N, EVT VT = N->getValueType(0); EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); bool IsMax = Opc == ISD::FMAXIMUM; + SDNodeFlags Flags = N->getFlags(); if (VT.isVector() && isOperationLegalOrCustomOrPromote(Opc, VT.getScalarType())) @@ -8444,15 +8445,15 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N, bool MinMaxMustRespectOrderedZero = false; if (isOperationLegalOrCustom(CompOpcIeee, VT)) { - MinMax = DAG.getNode(CompOpcIeee, DL, VT, LHS, RHS); + MinMax = DAG.getNode(CompOpcIeee, DL, VT, LHS, RHS, Flags); MinMaxMustRespectOrderedZero = true; } else if (isOperationLegalOrCustom(CompOpc, VT)) { - MinMax = DAG.getNode(CompOpc, DL, VT, LHS, RHS); + MinMax = DAG.getNode(CompOpc, DL, VT, LHS, RHS, Flags); } else { // NaN (if exists) will be propagated later, so orderness doesn't matter. SDValue Compare = DAG.getSetCC(DL, CCVT, LHS, RHS, IsMax ? 
ISD::SETGT : ISD::SETLT); - MinMax = DAG.getSelect(DL, VT, Compare, LHS, RHS); + MinMax = DAG.getSelect(DL, VT, Compare, LHS, RHS, Flags); } // Propagate any NaN of both operands @@ -8461,7 +8462,7 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N, ConstantFP *FPNaN = ConstantFP::get( *DAG.getContext(), APFloat::getNaN(DAG.EVTToAPFloatSemantics(VT))); MinMax = DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, LHS, RHS, ISD::SETUO), - DAG.getConstantFP(*FPNaN, DL, VT), MinMax); + DAG.getConstantFP(*FPNaN, DL, VT), MinMax, Flags); } // fminimum/fmaximum requires -0.0 less than +0.0 @@ -8473,11 +8474,11 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N, DAG.getTargetConstant(IsMax ? fcPosZero : fcNegZero, DL, MVT::i32); SDValue LCmp = DAG.getSelect( DL, VT, DAG.getNode(ISD::IS_FPCLASS, DL, CCVT, LHS, TestZero), LHS, - MinMax); + MinMax, Flags); SDValue RCmp = DAG.getSelect( DL, VT, DAG.getNode(ISD::IS_FPCLASS, DL, CCVT, RHS, TestZero), RHS, - LCmp); - MinMax = DAG.getSelect(DL, VT, IsZero, RCmp, MinMax); + LCmp, Flags); + MinMax = DAG.getSelect(DL, VT, IsZero, RCmp, MinMax, Flags); } return MinMax; From 9e8ecce88ef65a2953db8071746720dd78bd1632 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Wed, 29 May 2024 18:26:54 +0800 Subject: [PATCH 101/230] [DAGCombine] Transform `shl X, cttz(Y)` to `mul (Y & -Y), X` if cttz is unsupported (#85066) This patch folds `shl X, cttz(Y)` to `mul (Y & -Y), X` if cttz is unsupported by the target. Alive2: https://alive2.llvm.org/ce/z/AtLN5Y Fixes https://github.com/llvm/llvm-project/issues/84763.
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 12 + llvm/test/CodeGen/RISCV/shl-cttz.ll | 807 ++++++++++++++++++ 2 files changed, 819 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/shl-cttz.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 2f4fdf5208d076..42e861e61201c2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -10107,6 +10107,18 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { if (SDValue NewSHL = visitShiftByConstant(N)) return NewSHL; + // fold (shl X, cttz(Y)) -> (mul (Y & -Y), X) if cttz is unsupported on the + // target. + if ((N1.getOpcode() == ISD::CTTZ || N1.getOpcode() == ISD::CTTZ_ZERO_UNDEF) && + N1.hasOneUse() && !TLI.isOperationLegalOrCustom(ISD::CTTZ, VT) && + TLI.isOperationLegalOrCustom(ISD::MUL, VT)) { + SDValue Y = N1.getOperand(0); + SDLoc DL(N); + SDValue NegY = DAG.getNegative(Y, DL, VT); + SDValue And = DAG.getNode(ISD::AND, DL, VT, Y, NegY); + return DAG.getNode(ISD::MUL, DL, VT, And, N0); + } + if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); diff --git a/llvm/test/CodeGen/RISCV/shl-cttz.ll b/llvm/test/CodeGen/RISCV/shl-cttz.ll new file mode 100644 index 00000000000000..0eeb8b04c7e5d5 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/shl-cttz.ll @@ -0,0 +1,807 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=riscv32 -mattr=+m < %s \ +; RUN: | FileCheck %s -check-prefix=RV32I +; RUN: llc -mtriple=riscv32 -mattr=+m,+zbb < %s \ +; RUN: | FileCheck %s -check-prefix=RV32ZBB +; RUN: llc -mtriple=riscv64 -mattr=+m < %s \ +; RUN: | FileCheck %s -check-prefixes=RV64I,RV64IILLEGALI32 +; RUN: llc -mtriple=riscv64 -mattr=+m,+zbb < %s \ +; RUN: | FileCheck %s -check-prefixes=RV64ZBB,RV64ZBBILLEGALI32 +; RUN: llc -mtriple=riscv64 -mattr=+m -riscv-experimental-rv64-legal-i32 < %s \ +; RUN: | FileCheck %s 
-check-prefixes=RV64I,RV64ILEGALI32 +; RUN: llc -mtriple=riscv64 -mattr=+m,+zbb -riscv-experimental-rv64-legal-i32 < %s \ +; RUN: | FileCheck %s -check-prefixes=RV64ZBB,RV64ZBBLEGALI32 + +define i8 @shl_cttz_i8(i8 %x, i8 %y) { +; RV32I-LABEL: shl_cttz_i8: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: addi a2, a1, -1 +; RV32I-NEXT: not a1, a1 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: srli a2, a1, 1 +; RV32I-NEXT: andi a2, a2, 85 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: andi a2, a1, 51 +; RV32I-NEXT: srli a1, a1, 2 +; RV32I-NEXT: andi a1, a1, 51 +; RV32I-NEXT: add a1, a2, a1 +; RV32I-NEXT: srli a2, a1, 4 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: andi a1, a1, 15 +; RV32I-NEXT: sll a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: shl_cttz_i8: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: ctz a1, a1 +; RV32ZBB-NEXT: sll a0, a0, a1 +; RV32ZBB-NEXT: ret +; +; RV64IILLEGALI32-LABEL: shl_cttz_i8: +; RV64IILLEGALI32: # %bb.0: # %entry +; RV64IILLEGALI32-NEXT: addi a2, a1, -1 +; RV64IILLEGALI32-NEXT: not a1, a1 +; RV64IILLEGALI32-NEXT: and a1, a1, a2 +; RV64IILLEGALI32-NEXT: srli a2, a1, 1 +; RV64IILLEGALI32-NEXT: andi a2, a2, 85 +; RV64IILLEGALI32-NEXT: subw a1, a1, a2 +; RV64IILLEGALI32-NEXT: andi a2, a1, 51 +; RV64IILLEGALI32-NEXT: srli a1, a1, 2 +; RV64IILLEGALI32-NEXT: andi a1, a1, 51 +; RV64IILLEGALI32-NEXT: add a1, a2, a1 +; RV64IILLEGALI32-NEXT: srli a2, a1, 4 +; RV64IILLEGALI32-NEXT: add a1, a1, a2 +; RV64IILLEGALI32-NEXT: andi a1, a1, 15 +; RV64IILLEGALI32-NEXT: sll a0, a0, a1 +; RV64IILLEGALI32-NEXT: ret +; +; RV64ZBBILLEGALI32-LABEL: shl_cttz_i8: +; RV64ZBBILLEGALI32: # %bb.0: # %entry +; RV64ZBBILLEGALI32-NEXT: ctz a1, a1 +; RV64ZBBILLEGALI32-NEXT: sll a0, a0, a1 +; RV64ZBBILLEGALI32-NEXT: ret +; +; RV64ILEGALI32-LABEL: shl_cttz_i8: +; RV64ILEGALI32: # %bb.0: # %entry +; RV64ILEGALI32-NEXT: addi a2, a1, -1 +; RV64ILEGALI32-NEXT: not a1, a1 +; RV64ILEGALI32-NEXT: and a1, a1, a2 +; RV64ILEGALI32-NEXT: srliw a2, a1, 1 +; RV64ILEGALI32-NEXT: andi a2, a2, 85 
+; RV64ILEGALI32-NEXT: subw a1, a1, a2 +; RV64ILEGALI32-NEXT: andi a2, a1, 51 +; RV64ILEGALI32-NEXT: srliw a1, a1, 2 +; RV64ILEGALI32-NEXT: andi a1, a1, 51 +; RV64ILEGALI32-NEXT: add a1, a2, a1 +; RV64ILEGALI32-NEXT: srliw a2, a1, 4 +; RV64ILEGALI32-NEXT: add a1, a1, a2 +; RV64ILEGALI32-NEXT: andi a1, a1, 15 +; RV64ILEGALI32-NEXT: sllw a0, a0, a1 +; RV64ILEGALI32-NEXT: ret +; +; RV64ZBBLEGALI32-LABEL: shl_cttz_i8: +; RV64ZBBLEGALI32: # %bb.0: # %entry +; RV64ZBBLEGALI32-NEXT: ctzw a1, a1 +; RV64ZBBLEGALI32-NEXT: sllw a0, a0, a1 +; RV64ZBBLEGALI32-NEXT: ret +entry: + %cttz = call i8 @llvm.cttz.i8(i8 %y, i1 true) + %res = shl i8 %x, %cttz + ret i8 %res +} + +define i8 @shl_cttz_constant_i8(i8 %y) { +; RV32I-LABEL: shl_cttz_constant_i8: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: addi a1, a0, -1 +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: andi a1, a1, 85 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: andi a1, a0, 51 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: andi a0, a0, 51 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: andi a0, a0, 15 +; RV32I-NEXT: li a1, 4 +; RV32I-NEXT: sll a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: shl_cttz_constant_i8: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: ctz a0, a0 +; RV32ZBB-NEXT: li a1, 4 +; RV32ZBB-NEXT: sll a0, a1, a0 +; RV32ZBB-NEXT: ret +; +; RV64IILLEGALI32-LABEL: shl_cttz_constant_i8: +; RV64IILLEGALI32: # %bb.0: # %entry +; RV64IILLEGALI32-NEXT: addi a1, a0, -1 +; RV64IILLEGALI32-NEXT: not a0, a0 +; RV64IILLEGALI32-NEXT: and a0, a0, a1 +; RV64IILLEGALI32-NEXT: srli a1, a0, 1 +; RV64IILLEGALI32-NEXT: andi a1, a1, 85 +; RV64IILLEGALI32-NEXT: subw a0, a0, a1 +; RV64IILLEGALI32-NEXT: andi a1, a0, 51 +; RV64IILLEGALI32-NEXT: srli a0, a0, 2 +; RV64IILLEGALI32-NEXT: andi a0, a0, 51 +; RV64IILLEGALI32-NEXT: add a0, a1, a0 +; RV64IILLEGALI32-NEXT: srli a1, a0, 4 +; RV64IILLEGALI32-NEXT: add a0, a0, a1 +; 
RV64IILLEGALI32-NEXT: andi a0, a0, 15 +; RV64IILLEGALI32-NEXT: li a1, 4 +; RV64IILLEGALI32-NEXT: sll a0, a1, a0 +; RV64IILLEGALI32-NEXT: ret +; +; RV64ZBBILLEGALI32-LABEL: shl_cttz_constant_i8: +; RV64ZBBILLEGALI32: # %bb.0: # %entry +; RV64ZBBILLEGALI32-NEXT: ctz a0, a0 +; RV64ZBBILLEGALI32-NEXT: li a1, 4 +; RV64ZBBILLEGALI32-NEXT: sll a0, a1, a0 +; RV64ZBBILLEGALI32-NEXT: ret +; +; RV64ILEGALI32-LABEL: shl_cttz_constant_i8: +; RV64ILEGALI32: # %bb.0: # %entry +; RV64ILEGALI32-NEXT: addi a1, a0, -1 +; RV64ILEGALI32-NEXT: not a0, a0 +; RV64ILEGALI32-NEXT: and a0, a0, a1 +; RV64ILEGALI32-NEXT: srliw a1, a0, 1 +; RV64ILEGALI32-NEXT: andi a1, a1, 85 +; RV64ILEGALI32-NEXT: subw a0, a0, a1 +; RV64ILEGALI32-NEXT: andi a1, a0, 51 +; RV64ILEGALI32-NEXT: srliw a0, a0, 2 +; RV64ILEGALI32-NEXT: andi a0, a0, 51 +; RV64ILEGALI32-NEXT: add a0, a1, a0 +; RV64ILEGALI32-NEXT: srliw a1, a0, 4 +; RV64ILEGALI32-NEXT: add a0, a0, a1 +; RV64ILEGALI32-NEXT: andi a0, a0, 15 +; RV64ILEGALI32-NEXT: li a1, 4 +; RV64ILEGALI32-NEXT: sllw a0, a1, a0 +; RV64ILEGALI32-NEXT: ret +; +; RV64ZBBLEGALI32-LABEL: shl_cttz_constant_i8: +; RV64ZBBLEGALI32: # %bb.0: # %entry +; RV64ZBBLEGALI32-NEXT: ctzw a0, a0 +; RV64ZBBLEGALI32-NEXT: li a1, 4 +; RV64ZBBLEGALI32-NEXT: sllw a0, a1, a0 +; RV64ZBBLEGALI32-NEXT: ret +entry: + %cttz = call i8 @llvm.cttz.i8(i8 %y, i1 true) + %res = shl i8 4, %cttz + ret i8 %res +} + +define i16 @shl_cttz_i16(i16 %x, i16 %y) { +; RV32I-LABEL: shl_cttz_i16: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: addi a2, a1, -1 +; RV32I-NEXT: not a1, a1 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: srli a2, a1, 1 +; RV32I-NEXT: lui a3, 5 +; RV32I-NEXT: addi a3, a3, 1365 +; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: lui a2, 3 +; RV32I-NEXT: addi a2, a2, 819 +; RV32I-NEXT: and a3, a1, a2 +; RV32I-NEXT: srli a1, a1, 2 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: srli a2, a1, 4 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: andi a2, a1, 15 
+; RV32I-NEXT: slli a1, a1, 20 +; RV32I-NEXT: srli a1, a1, 28 +; RV32I-NEXT: add a1, a2, a1 +; RV32I-NEXT: sll a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: shl_cttz_i16: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: ctz a1, a1 +; RV32ZBB-NEXT: sll a0, a0, a1 +; RV32ZBB-NEXT: ret +; +; RV64IILLEGALI32-LABEL: shl_cttz_i16: +; RV64IILLEGALI32: # %bb.0: # %entry +; RV64IILLEGALI32-NEXT: addi a2, a1, -1 +; RV64IILLEGALI32-NEXT: not a1, a1 +; RV64IILLEGALI32-NEXT: and a1, a1, a2 +; RV64IILLEGALI32-NEXT: srli a2, a1, 1 +; RV64IILLEGALI32-NEXT: lui a3, 5 +; RV64IILLEGALI32-NEXT: addiw a3, a3, 1365 +; RV64IILLEGALI32-NEXT: and a2, a2, a3 +; RV64IILLEGALI32-NEXT: sub a1, a1, a2 +; RV64IILLEGALI32-NEXT: lui a2, 3 +; RV64IILLEGALI32-NEXT: addiw a2, a2, 819 +; RV64IILLEGALI32-NEXT: and a3, a1, a2 +; RV64IILLEGALI32-NEXT: srli a1, a1, 2 +; RV64IILLEGALI32-NEXT: and a1, a1, a2 +; RV64IILLEGALI32-NEXT: add a1, a3, a1 +; RV64IILLEGALI32-NEXT: srli a2, a1, 4 +; RV64IILLEGALI32-NEXT: add a1, a1, a2 +; RV64IILLEGALI32-NEXT: andi a2, a1, 15 +; RV64IILLEGALI32-NEXT: slli a1, a1, 52 +; RV64IILLEGALI32-NEXT: srli a1, a1, 60 +; RV64IILLEGALI32-NEXT: add a1, a2, a1 +; RV64IILLEGALI32-NEXT: sll a0, a0, a1 +; RV64IILLEGALI32-NEXT: ret +; +; RV64ZBBILLEGALI32-LABEL: shl_cttz_i16: +; RV64ZBBILLEGALI32: # %bb.0: # %entry +; RV64ZBBILLEGALI32-NEXT: ctz a1, a1 +; RV64ZBBILLEGALI32-NEXT: sll a0, a0, a1 +; RV64ZBBILLEGALI32-NEXT: ret +; +; RV64ILEGALI32-LABEL: shl_cttz_i16: +; RV64ILEGALI32: # %bb.0: # %entry +; RV64ILEGALI32-NEXT: addi a2, a1, -1 +; RV64ILEGALI32-NEXT: not a1, a1 +; RV64ILEGALI32-NEXT: and a1, a1, a2 +; RV64ILEGALI32-NEXT: srliw a2, a1, 1 +; RV64ILEGALI32-NEXT: lui a3, 5 +; RV64ILEGALI32-NEXT: addi a3, a3, 1365 +; RV64ILEGALI32-NEXT: and a2, a2, a3 +; RV64ILEGALI32-NEXT: subw a1, a1, a2 +; RV64ILEGALI32-NEXT: lui a2, 3 +; RV64ILEGALI32-NEXT: addi a2, a2, 819 +; RV64ILEGALI32-NEXT: and a3, a1, a2 +; RV64ILEGALI32-NEXT: srliw a1, a1, 2 +; RV64ILEGALI32-NEXT: and a1, a1, a2 +; 
RV64ILEGALI32-NEXT: add a1, a3, a1 +; RV64ILEGALI32-NEXT: srliw a2, a1, 4 +; RV64ILEGALI32-NEXT: add a1, a1, a2 +; RV64ILEGALI32-NEXT: andi a2, a1, 15 +; RV64ILEGALI32-NEXT: slli a1, a1, 52 +; RV64ILEGALI32-NEXT: srli a1, a1, 60 +; RV64ILEGALI32-NEXT: add a1, a2, a1 +; RV64ILEGALI32-NEXT: sllw a0, a0, a1 +; RV64ILEGALI32-NEXT: ret +; +; RV64ZBBLEGALI32-LABEL: shl_cttz_i16: +; RV64ZBBLEGALI32: # %bb.0: # %entry +; RV64ZBBLEGALI32-NEXT: ctzw a1, a1 +; RV64ZBBLEGALI32-NEXT: sllw a0, a0, a1 +; RV64ZBBLEGALI32-NEXT: ret +entry: + %cttz = call i16 @llvm.cttz.i16(i16 %y, i1 true) + %res = shl i16 %x, %cttz + ret i16 %res +} + +define i16 @shl_cttz_constant_i16(i16 %y) { +; RV32I-LABEL: shl_cttz_constant_i16: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: addi a1, a0, -1 +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: lui a2, 5 +; RV32I-NEXT: addi a2, a2, 1365 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: lui a1, 3 +; RV32I-NEXT: addi a1, a1, 819 +; RV32I-NEXT: and a2, a0, a1 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: andi a1, a0, 15 +; RV32I-NEXT: slli a0, a0, 20 +; RV32I-NEXT: srli a0, a0, 28 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: li a1, 4 +; RV32I-NEXT: sll a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: shl_cttz_constant_i16: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: ctz a0, a0 +; RV32ZBB-NEXT: li a1, 4 +; RV32ZBB-NEXT: sll a0, a1, a0 +; RV32ZBB-NEXT: ret +; +; RV64IILLEGALI32-LABEL: shl_cttz_constant_i16: +; RV64IILLEGALI32: # %bb.0: # %entry +; RV64IILLEGALI32-NEXT: addi a1, a0, -1 +; RV64IILLEGALI32-NEXT: not a0, a0 +; RV64IILLEGALI32-NEXT: and a0, a0, a1 +; RV64IILLEGALI32-NEXT: srli a1, a0, 1 +; RV64IILLEGALI32-NEXT: lui a2, 5 +; RV64IILLEGALI32-NEXT: addiw a2, a2, 1365 +; RV64IILLEGALI32-NEXT: and a1, a1, a2 +; RV64IILLEGALI32-NEXT: sub a0, a0, a1 +; 
RV64IILLEGALI32-NEXT: lui a1, 3 +; RV64IILLEGALI32-NEXT: addiw a1, a1, 819 +; RV64IILLEGALI32-NEXT: and a2, a0, a1 +; RV64IILLEGALI32-NEXT: srli a0, a0, 2 +; RV64IILLEGALI32-NEXT: and a0, a0, a1 +; RV64IILLEGALI32-NEXT: add a0, a2, a0 +; RV64IILLEGALI32-NEXT: srli a1, a0, 4 +; RV64IILLEGALI32-NEXT: add a0, a0, a1 +; RV64IILLEGALI32-NEXT: andi a1, a0, 15 +; RV64IILLEGALI32-NEXT: slli a0, a0, 52 +; RV64IILLEGALI32-NEXT: srli a0, a0, 60 +; RV64IILLEGALI32-NEXT: add a0, a1, a0 +; RV64IILLEGALI32-NEXT: li a1, 4 +; RV64IILLEGALI32-NEXT: sll a0, a1, a0 +; RV64IILLEGALI32-NEXT: ret +; +; RV64ZBBILLEGALI32-LABEL: shl_cttz_constant_i16: +; RV64ZBBILLEGALI32: # %bb.0: # %entry +; RV64ZBBILLEGALI32-NEXT: ctz a0, a0 +; RV64ZBBILLEGALI32-NEXT: li a1, 4 +; RV64ZBBILLEGALI32-NEXT: sll a0, a1, a0 +; RV64ZBBILLEGALI32-NEXT: ret +; +; RV64ILEGALI32-LABEL: shl_cttz_constant_i16: +; RV64ILEGALI32: # %bb.0: # %entry +; RV64ILEGALI32-NEXT: addi a1, a0, -1 +; RV64ILEGALI32-NEXT: not a0, a0 +; RV64ILEGALI32-NEXT: and a0, a0, a1 +; RV64ILEGALI32-NEXT: srliw a1, a0, 1 +; RV64ILEGALI32-NEXT: lui a2, 5 +; RV64ILEGALI32-NEXT: addi a2, a2, 1365 +; RV64ILEGALI32-NEXT: and a1, a1, a2 +; RV64ILEGALI32-NEXT: subw a0, a0, a1 +; RV64ILEGALI32-NEXT: lui a1, 3 +; RV64ILEGALI32-NEXT: addi a1, a1, 819 +; RV64ILEGALI32-NEXT: and a2, a0, a1 +; RV64ILEGALI32-NEXT: srliw a0, a0, 2 +; RV64ILEGALI32-NEXT: and a0, a0, a1 +; RV64ILEGALI32-NEXT: add a0, a2, a0 +; RV64ILEGALI32-NEXT: srliw a1, a0, 4 +; RV64ILEGALI32-NEXT: add a0, a0, a1 +; RV64ILEGALI32-NEXT: andi a1, a0, 15 +; RV64ILEGALI32-NEXT: slli a0, a0, 52 +; RV64ILEGALI32-NEXT: srli a0, a0, 60 +; RV64ILEGALI32-NEXT: add a0, a1, a0 +; RV64ILEGALI32-NEXT: li a1, 4 +; RV64ILEGALI32-NEXT: sllw a0, a1, a0 +; RV64ILEGALI32-NEXT: ret +; +; RV64ZBBLEGALI32-LABEL: shl_cttz_constant_i16: +; RV64ZBBLEGALI32: # %bb.0: # %entry +; RV64ZBBLEGALI32-NEXT: ctzw a0, a0 +; RV64ZBBLEGALI32-NEXT: li a1, 4 +; RV64ZBBLEGALI32-NEXT: sllw a0, a1, a0 +; RV64ZBBLEGALI32-NEXT: ret 
+entry: + %cttz = call i16 @llvm.cttz.i16(i16 %y, i1 true) + %res = shl i16 4, %cttz + ret i16 %res +} + +define i32 @shl_cttz_i32(i32 %x, i32 %y) { +; RV32I-LABEL: shl_cttz_i32: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: neg a2, a1 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: mul a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: shl_cttz_i32: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: ctz a1, a1 +; RV32ZBB-NEXT: sll a0, a0, a1 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: shl_cttz_i32: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: lui a2, 30667 +; RV64I-NEXT: addi a2, a2, 1329 +; RV64I-NEXT: mul a1, a1, a2 +; RV64I-NEXT: srliw a1, a1, 27 +; RV64I-NEXT: lui a2, %hi(.LCPI4_0) +; RV64I-NEXT: addi a2, a2, %lo(.LCPI4_0) +; RV64I-NEXT: add a1, a2, a1 +; RV64I-NEXT: lbu a1, 0(a1) +; RV64I-NEXT: sllw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: shl_cttz_i32: +; RV64ZBB: # %bb.0: # %entry +; RV64ZBB-NEXT: ctzw a1, a1 +; RV64ZBB-NEXT: sllw a0, a0, a1 +; RV64ZBB-NEXT: ret +entry: + %cttz = call i32 @llvm.cttz.i32(i32 %y, i1 true) + %res = shl i32 %x, %cttz + ret i32 %res +} + +define i32 @shl_cttz_i32_zero_is_defined(i32 %x, i32 %y) { +; RV32I-LABEL: shl_cttz_i32_zero_is_defined: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: beqz a1, .LBB5_2 +; RV32I-NEXT: # %bb.1: # %cond.false +; RV32I-NEXT: neg a2, a1 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: lui a2, 30667 +; RV32I-NEXT: addi a2, a2, 1329 +; RV32I-NEXT: mul a1, a1, a2 +; RV32I-NEXT: srli a1, a1, 27 +; RV32I-NEXT: lui a2, %hi(.LCPI5_0) +; RV32I-NEXT: addi a2, a2, %lo(.LCPI5_0) +; RV32I-NEXT: add a1, a2, a1 +; RV32I-NEXT: lbu a1, 0(a1) +; RV32I-NEXT: sll a0, a0, a1 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB5_2: +; RV32I-NEXT: li a1, 32 +; RV32I-NEXT: sll a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: shl_cttz_i32_zero_is_defined: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: ctz a1, a1 +; RV32ZBB-NEXT: sll a0, a0, a1 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: 
shl_cttz_i32_zero_is_defined: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: sext.w a2, a1 +; RV64I-NEXT: beqz a2, .LBB5_2 +; RV64I-NEXT: # %bb.1: # %cond.false +; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: lui a2, 30667 +; RV64I-NEXT: addi a2, a2, 1329 +; RV64I-NEXT: mul a1, a1, a2 +; RV64I-NEXT: srliw a1, a1, 27 +; RV64I-NEXT: lui a2, %hi(.LCPI5_0) +; RV64I-NEXT: addi a2, a2, %lo(.LCPI5_0) +; RV64I-NEXT: add a1, a2, a1 +; RV64I-NEXT: lbu a1, 0(a1) +; RV64I-NEXT: sllw a0, a0, a1 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB5_2: +; RV64I-NEXT: li a1, 32 +; RV64I-NEXT: sllw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: shl_cttz_i32_zero_is_defined: +; RV64ZBB: # %bb.0: # %entry +; RV64ZBB-NEXT: ctzw a1, a1 +; RV64ZBB-NEXT: sllw a0, a0, a1 +; RV64ZBB-NEXT: ret +entry: + %cttz = call i32 @llvm.cttz.i32(i32 %y, i1 false) + %res = shl i32 %x, %cttz + ret i32 %res +} + +define i32 @shl_cttz_constant_i32(i32 %y) { +; RV32I-LABEL: shl_cttz_constant_i32: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: neg a1, a0 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: slli a0, a0, 2 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: shl_cttz_constant_i32: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: ctz a0, a0 +; RV32ZBB-NEXT: li a1, 4 +; RV32ZBB-NEXT: sll a0, a1, a0 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: shl_cttz_constant_i32: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: negw a1, a0 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: lui a1, 30667 +; RV64I-NEXT: addi a1, a1, 1329 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: srliw a0, a0, 27 +; RV64I-NEXT: lui a1, %hi(.LCPI6_0) +; RV64I-NEXT: addi a1, a1, %lo(.LCPI6_0) +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: lbu a0, 0(a0) +; RV64I-NEXT: li a1, 4 +; RV64I-NEXT: sllw a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: shl_cttz_constant_i32: +; RV64ZBB: # %bb.0: # %entry +; RV64ZBB-NEXT: ctzw a0, a0 +; RV64ZBB-NEXT: li a1, 4 +; RV64ZBB-NEXT: sllw a0, a1, a0 +; RV64ZBB-NEXT: ret +entry: + %cttz = call i32 @llvm.cttz.i32(i32 %y, i1 true) 
+ %res = shl i32 4, %cttz + ret i32 %res +} + +define i32 @shl_cttz_multiuse_i32(i32 %x, i32 %y) { +; RV32I-LABEL: shl_cttz_multiuse_i32: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: neg a2, a1 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: lui a2, 30667 +; RV32I-NEXT: addi a2, a2, 1329 +; RV32I-NEXT: mul a1, a1, a2 +; RV32I-NEXT: srli a1, a1, 27 +; RV32I-NEXT: lui a2, %hi(.LCPI7_0) +; RV32I-NEXT: addi a2, a2, %lo(.LCPI7_0) +; RV32I-NEXT: add a1, a2, a1 +; RV32I-NEXT: lbu s0, 0(a1) +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call use32 +; RV32I-NEXT: sll a0, s1, s0 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: shl_cttz_multiuse_i32: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: addi sp, sp, -16 +; RV32ZBB-NEXT: .cfi_def_cfa_offset 16 +; RV32ZBB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32ZBB-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32ZBB-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32ZBB-NEXT: .cfi_offset ra, -4 +; RV32ZBB-NEXT: .cfi_offset s0, -8 +; RV32ZBB-NEXT: .cfi_offset s1, -12 +; RV32ZBB-NEXT: mv s0, a0 +; RV32ZBB-NEXT: ctz s1, a1 +; RV32ZBB-NEXT: mv a0, s1 +; RV32ZBB-NEXT: call use32 +; RV32ZBB-NEXT: sll a0, s0, s1 +; RV32ZBB-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32ZBB-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32ZBB-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32ZBB-NEXT: addi sp, sp, 16 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: shl_cttz_multiuse_i32: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: addi sp, sp, -32 +; 
RV64I-NEXT: .cfi_def_cfa_offset 32 +; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: lui a2, 30667 +; RV64I-NEXT: addi a2, a2, 1329 +; RV64I-NEXT: mul a1, a1, a2 +; RV64I-NEXT: srliw a1, a1, 27 +; RV64I-NEXT: lui a2, %hi(.LCPI7_0) +; RV64I-NEXT: addi a2, a2, %lo(.LCPI7_0) +; RV64I-NEXT: add a1, a2, a1 +; RV64I-NEXT: lbu s0, 0(a1) +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call use32 +; RV64I-NEXT: sllw a0, s1, s0 +; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 32 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: shl_cttz_multiuse_i32: +; RV64ZBB: # %bb.0: # %entry +; RV64ZBB-NEXT: addi sp, sp, -32 +; RV64ZBB-NEXT: .cfi_def_cfa_offset 32 +; RV64ZBB-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64ZBB-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64ZBB-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; RV64ZBB-NEXT: .cfi_offset ra, -8 +; RV64ZBB-NEXT: .cfi_offset s0, -16 +; RV64ZBB-NEXT: .cfi_offset s1, -24 +; RV64ZBB-NEXT: mv s0, a0 +; RV64ZBB-NEXT: ctzw s1, a1 +; RV64ZBB-NEXT: mv a0, s1 +; RV64ZBB-NEXT: call use32 +; RV64ZBB-NEXT: sllw a0, s0, s1 +; RV64ZBB-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64ZBB-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64ZBB-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; RV64ZBB-NEXT: addi sp, sp, 32 +; RV64ZBB-NEXT: ret +entry: + %cttz = call i32 @llvm.cttz.i32(i32 %y, i1 true) + call void @use32(i32 %cttz) + %res = shl i32 %x, %cttz + ret i32 %res +} + +define i64 @shl_cttz_i64(i64 %x, i64 %y) { +; RV32I-LABEL: shl_cttz_i64: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: lui a4, 30667 +; RV32I-NEXT: addi a5, a4, 1329 +; 
RV32I-NEXT: lui a4, %hi(.LCPI8_0) +; RV32I-NEXT: addi a4, a4, %lo(.LCPI8_0) +; RV32I-NEXT: bnez a2, .LBB8_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: neg a2, a3 +; RV32I-NEXT: and a2, a3, a2 +; RV32I-NEXT: mul a2, a2, a5 +; RV32I-NEXT: srli a2, a2, 27 +; RV32I-NEXT: add a2, a4, a2 +; RV32I-NEXT: lbu a2, 0(a2) +; RV32I-NEXT: addi a4, a2, 32 +; RV32I-NEXT: j .LBB8_3 +; RV32I-NEXT: .LBB8_2: +; RV32I-NEXT: neg a3, a2 +; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: mul a2, a2, a5 +; RV32I-NEXT: srli a2, a2, 27 +; RV32I-NEXT: add a2, a4, a2 +; RV32I-NEXT: lbu a4, 0(a2) +; RV32I-NEXT: .LBB8_3: # %entry +; RV32I-NEXT: addi a3, a4, -32 +; RV32I-NEXT: sll a2, a0, a4 +; RV32I-NEXT: bltz a3, .LBB8_5 +; RV32I-NEXT: # %bb.4: # %entry +; RV32I-NEXT: mv a1, a2 +; RV32I-NEXT: j .LBB8_6 +; RV32I-NEXT: .LBB8_5: +; RV32I-NEXT: sll a1, a1, a4 +; RV32I-NEXT: not a4, a4 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: srl a0, a0, a4 +; RV32I-NEXT: or a1, a1, a0 +; RV32I-NEXT: .LBB8_6: # %entry +; RV32I-NEXT: srai a0, a3, 31 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: shl_cttz_i64: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: bnez a2, .LBB8_2 +; RV32ZBB-NEXT: # %bb.1: # %entry +; RV32ZBB-NEXT: ctz a2, a3 +; RV32ZBB-NEXT: addi a4, a2, 32 +; RV32ZBB-NEXT: j .LBB8_3 +; RV32ZBB-NEXT: .LBB8_2: +; RV32ZBB-NEXT: ctz a4, a2 +; RV32ZBB-NEXT: .LBB8_3: # %entry +; RV32ZBB-NEXT: addi a3, a4, -32 +; RV32ZBB-NEXT: sll a2, a0, a4 +; RV32ZBB-NEXT: bltz a3, .LBB8_5 +; RV32ZBB-NEXT: # %bb.4: # %entry +; RV32ZBB-NEXT: mv a1, a2 +; RV32ZBB-NEXT: j .LBB8_6 +; RV32ZBB-NEXT: .LBB8_5: +; RV32ZBB-NEXT: sll a1, a1, a4 +; RV32ZBB-NEXT: not a4, a4 +; RV32ZBB-NEXT: srli a0, a0, 1 +; RV32ZBB-NEXT: srl a0, a0, a4 +; RV32ZBB-NEXT: or a1, a1, a0 +; RV32ZBB-NEXT: .LBB8_6: # %entry +; RV32ZBB-NEXT: srai a0, a3, 31 +; RV32ZBB-NEXT: and a0, a0, a2 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: shl_cttz_i64: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: neg a2, a1 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: 
mul a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: shl_cttz_i64: +; RV64ZBB: # %bb.0: # %entry +; RV64ZBB-NEXT: ctz a1, a1 +; RV64ZBB-NEXT: sll a0, a0, a1 +; RV64ZBB-NEXT: ret +entry: + %cttz = call i64 @llvm.cttz.i64(i64 %y, i1 true) + %res = shl i64 %x, %cttz + ret i64 %res +} + +define i64 @shl_cttz_constant_i64(i64 %y) { +; RV32I-LABEL: shl_cttz_constant_i64: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: lui a2, 30667 +; RV32I-NEXT: addi a3, a2, 1329 +; RV32I-NEXT: lui a2, %hi(.LCPI9_0) +; RV32I-NEXT: addi a2, a2, %lo(.LCPI9_0) +; RV32I-NEXT: bnez a0, .LBB9_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: neg a0, a1 +; RV32I-NEXT: and a0, a1, a0 +; RV32I-NEXT: mul a0, a0, a3 +; RV32I-NEXT: srli a0, a0, 27 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: lbu a0, 0(a0) +; RV32I-NEXT: addi a1, a0, 32 +; RV32I-NEXT: j .LBB9_3 +; RV32I-NEXT: .LBB9_2: +; RV32I-NEXT: neg a1, a0 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: mul a0, a0, a3 +; RV32I-NEXT: srli a0, a0, 27 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: lbu a1, 0(a0) +; RV32I-NEXT: .LBB9_3: # %entry +; RV32I-NEXT: li a0, 4 +; RV32I-NEXT: addi a2, a1, -32 +; RV32I-NEXT: sll a0, a0, a1 +; RV32I-NEXT: bltz a2, .LBB9_5 +; RV32I-NEXT: # %bb.4: # %entry +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: j .LBB9_6 +; RV32I-NEXT: .LBB9_5: +; RV32I-NEXT: not a1, a1 +; RV32I-NEXT: li a3, 2 +; RV32I-NEXT: srl a1, a3, a1 +; RV32I-NEXT: .LBB9_6: # %entry +; RV32I-NEXT: srai a2, a2, 31 +; RV32I-NEXT: and a0, a2, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: shl_cttz_constant_i64: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: bnez a0, .LBB9_2 +; RV32ZBB-NEXT: # %bb.1: # %entry +; RV32ZBB-NEXT: ctz a0, a1 +; RV32ZBB-NEXT: addi a1, a0, 32 +; RV32ZBB-NEXT: j .LBB9_3 +; RV32ZBB-NEXT: .LBB9_2: +; RV32ZBB-NEXT: ctz a1, a0 +; RV32ZBB-NEXT: .LBB9_3: # %entry +; RV32ZBB-NEXT: li a0, 4 +; RV32ZBB-NEXT: addi a2, a1, -32 +; RV32ZBB-NEXT: sll a0, a0, a1 +; RV32ZBB-NEXT: bltz a2, .LBB9_5 +; RV32ZBB-NEXT: # %bb.4: # %entry +; RV32ZBB-NEXT: mv a1, a0 +; 
RV32ZBB-NEXT: j .LBB9_6 +; RV32ZBB-NEXT: .LBB9_5: +; RV32ZBB-NEXT: not a1, a1 +; RV32ZBB-NEXT: li a3, 2 +; RV32ZBB-NEXT: srl a1, a3, a1 +; RV32ZBB-NEXT: .LBB9_6: # %entry +; RV32ZBB-NEXT: srai a2, a2, 31 +; RV32ZBB-NEXT: and a0, a2, a0 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: shl_cttz_constant_i64: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 2 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: shl_cttz_constant_i64: +; RV64ZBB: # %bb.0: # %entry +; RV64ZBB-NEXT: ctz a0, a0 +; RV64ZBB-NEXT: li a1, 4 +; RV64ZBB-NEXT: sll a0, a1, a0 +; RV64ZBB-NEXT: ret +entry: + %cttz = call i64 @llvm.cttz.i64(i64 %y, i1 true) + %res = shl i64 4, %cttz + ret i64 %res +} + +declare void @use32(i32 signext) From 23a09b99313edb67d267a974be6cebfdfd97c7c8 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Wed, 29 May 2024 10:25:19 +0000 Subject: [PATCH 102/230] [lldb][Test] Remove some xfails for AArch64 Linux PR #92245 fixed these tests on Linux. They likely work on FreeBSD too but leaving the xfail for that so it can be confirmed later. Also updated a bugzilla link to one that redirects to Github issues. Relates to issues #43398 and #48751. --- lldb/test/API/commands/expression/fixits/TestFixIts.py | 3 +-- .../expression/static-initializers/TestStaticInitializers.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/lldb/test/API/commands/expression/fixits/TestFixIts.py b/lldb/test/API/commands/expression/fixits/TestFixIts.py index bc53b72fe611b9..1b22ed1c0077c4 100644 --- a/lldb/test/API/commands/expression/fixits/TestFixIts.py +++ b/lldb/test/API/commands/expression/fixits/TestFixIts.py @@ -106,9 +106,8 @@ def test_with_target_error_applies_fixit(self): ) self.assertIn("null_pointer->first", ret_val.GetError()) - # The final function call runs into SIGILL on aarch64-linux. 
@expectedFailureAll( - archs=["aarch64"], oslist=["freebsd", "linux"], bugnumber="llvm.org/pr49407" + archs=["aarch64"], oslist=["freebsd"], bugnumber="llvm.org/pr49407" ) def test_with_multiple_retries(self): """Test calling expressions with errors that can be fixed by the FixIts.""" diff --git a/lldb/test/API/commands/expression/static-initializers/TestStaticInitializers.py b/lldb/test/API/commands/expression/static-initializers/TestStaticInitializers.py index 5fc37ac6a5818a..ea3aa6a4608c41 100644 --- a/lldb/test/API/commands/expression/static-initializers/TestStaticInitializers.py +++ b/lldb/test/API/commands/expression/static-initializers/TestStaticInitializers.py @@ -7,8 +7,8 @@ class StaticInitializers(TestBase): @expectedFailureAll( archs="aarch64", - oslist=["freebsd", "linux"], - bugnumber="https://bugs.llvm.org/show_bug.cgi?id=44053", + oslist=["freebsd"], + bugnumber="llvm.org/pr44053", ) def test(self): """Test a static initializer.""" From e93799f260e881ff2f7c0fd7afc78374d615d70e Mon Sep 17 00:00:00 2001 From: Lukacma Date: Wed, 29 May 2024 11:34:24 +0100 Subject: [PATCH 103/230] [SME] Add intrinsics for FCVT(wid.) and FCVTL (#93202) According to the specification in https://github.com/ARM-software/acle/pull/309 this adds the intrinsics ``` svfloat32x2_t svcvt_f32[_f16_x2](svfloat16_t zn) __arm_streaming; svfloat32x2_t svcvtl_f32[_f16_x2](svfloat16_t zn) __arm_streaming; ``` These are available only if __ARM_FEATURE_SME_F16F16 is enabled. 
--------- Co-authored-by: Caroline Concatto --- clang/include/clang/Basic/arm_sve.td | 11 +++++ .../aarch64-sme2-intrinsics/acle_sme2_cvt.c | 22 ++++++++++ .../aarch64-sme2-intrinsics/acle_sme2_cvtl.c | 40 +++++++++++++++++++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 14 ++++++- .../Target/AArch64/AArch64ISelDAGToDAG.cpp | 6 +++ .../CodeGen/AArch64/sme2-intrinsics-cvt.ll | 11 ++++- .../CodeGen/AArch64/sme2-intrinsics-cvtl.ll | 11 +++++ 7 files changed, 113 insertions(+), 2 deletions(-) create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.c create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-cvtl.ll diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index 078ef576342a7c..88938a981fd8ae 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -2270,6 +2270,10 @@ let TargetGuard = "sme2" in { def SVCVT_S32_F32_X4 : SInst<"svcvt_{d}[_f32_x4]", "4.d4.M", "i", MergeNone, "aarch64_sve_fcvtzs_x4", [IsStreaming, IsOverloadWhileOrMultiVecCvt], []>; } +let TargetGuard = "sme-f16f16" in { + def SVCVT_F32_X2 : SInst<"svcvt_{d}[_f16_x2]", "2h", "f", MergeNone, "aarch64_sve_fcvt_widen_x2", [ IsStreaming],[]>; +} + // // Multi-vector floating-point convert from single-precision to interleaved half-precision/BFloat16 // @@ -2278,6 +2282,13 @@ let TargetGuard = "sme2" in { def SVCVTN_BF16_X2 : SInst<"svcvtn_bf16[_f32_x2]", "$2", "f", MergeNone, "aarch64_sve_bfcvtn_x2", [IsOverloadNone, IsStreaming],[]>; } +// +//Multi-vector floating-point convert from half-precision to deinterleaved single-precision. 
+// +let TargetGuard = "sme-f16f16" in { + def SVCVTL_F32_X2 : SInst<"svcvtl_f32[_f16_x2]", "2h", "f", MergeNone, "aarch64_sve_fcvtl_widen_x2", [ IsStreaming],[]>; +} + // // Multi-vector saturating extract narrow // diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c index 4a5ee7e021f748..e26499d3a63cc4 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c @@ -497,3 +497,25 @@ svuint8_t test_qcvt_u8_s32_x4(svint32x4_t zn) __arm_streaming { svuint16_t test_qcvt_u16_s64_x4(svint64x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svqcvt_u16,_s64_x4,,)(zn); } + +// CHECK-LABEL: @test_cvt_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fcvt.widen.x2.nxv4f32( [[ZN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z15test_cvt_f32_x2u13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fcvt.widen.x2.nxv4f32( [[ZN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +__attribute__((target("sme-f16f16"))) svfloat32x2_t test_cvt_f32_x2(svfloat16_t zn) __arm_streaming { + return SVE_ACLE_FUNC(svcvt_f32,_f16_x2,,)(zn); +} diff --git 
a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.c new file mode 100644 index 00000000000000..453dd3db6adf09 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.c @@ -0,0 +1,40 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f16f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f16f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f16f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f16f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f16f16 -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_cvtl_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fcvtl.widen.x2.nxv4f32( [[ZN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z16test_cvtl_f32_x2u13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fcvtl.widen.x2.nxv4f32( [[ZN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat32x2_t test_cvtl_f32_x2(svfloat16_t zn) __arm_streaming { + return SVE_ACLE_FUNC(svcvtl_f32,_f16_x2,,)(zn); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 57d0dfb698b383..f2028f8e8fd05a 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3121,6 +3121,11 @@ let TargetPrefix = "aarch64" in { : DefaultAttrsIntrinsic<[llvm_nxv8bf16_ty], [llvm_nxv4f32_ty, llvm_nxv4f32_ty], [IntrNoMem]>; + + class SME2_CVT_WIDENING_VG2_Intrinsic + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], + [LLVMSubdivide2VectorType<0>], [IntrNoMem]>; + class SME2_CVT_VG4_SINGLE_Intrinsic : DefaultAttrsIntrinsic<[LLVMSubdivide4VectorType<0>], @@ -3412,6 
+3417,13 @@ let TargetPrefix = "aarch64" in { def int_aarch64_sme_suvdot_lane_za32_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic; def int_aarch64_sme_usvdot_lane_za32_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic; + + // + //Multi-vector floating-point convert from half-precision to deinterleaved single-precision. + // + + def int_aarch64_sve_fcvtl_widen_x2 : SME2_CVT_WIDENING_VG2_Intrinsic; + // // Multi-vector floating-point CVT from single-precision to interleaved half-precision/BFloat16 // @@ -3431,7 +3443,7 @@ let TargetPrefix = "aarch64" in { def int_aarch64_sve_fcvtzu_x4 : SME2_CVT_X4_Intrinsic; def int_aarch64_sve_scvtf_x4 : SME2_CVT_X4_Intrinsic; def int_aarch64_sve_ucvtf_x4 : SME2_CVT_X4_Intrinsic; - + def int_aarch64_sve_fcvt_widen_x2 : SME2_CVT_WIDENING_VG2_Intrinsic; // // Multi-vector saturating extract narrow // diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 660675cf8f3895..8fd58f4698d280 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -5717,6 +5717,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { case Intrinsic::aarch64_sve_ucvtf_x4: SelectCVTIntrinsic(Node, 4, AArch64::UCVTF_4Z4Z_StoS); return; + case Intrinsic::aarch64_sve_fcvt_widen_x2: + SelectUnaryMultiIntrinsic(Node, 2, false, AArch64::FCVT_2ZZ_H_S); + return; + case Intrinsic::aarch64_sve_fcvtl_widen_x2: + SelectUnaryMultiIntrinsic(Node, 2, false, AArch64::FCVTL_2ZZ_H_S); + return; case Intrinsic::aarch64_sve_sclamp_single_x2: if (auto Op = SelectOpcodeFromVT( Node->getValueType(0), diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll index bc1db878cbd313..611cdcda157e21 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme-f16f16 -verify-machineinstrs < %s | FileCheck %s ; ; FCVT @@ -139,6 +139,15 @@ define {, ,, , ,, } %res } +define {, } @multi_vector_cvt_widen_x2_f16( %zn0) { +; CHECK-LABEL: multi_vector_cvt_widen_x2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvt { z0.s, z1.s }, z0.h +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.fcvt.widen.x2.nxv4f32( %zn0) + ret {, } %res +} + declare @llvm.aarch64.sve.fcvt.x2.nxv4f32(, ) declare @llvm.aarch64.sve.bfcvt.x2(, ) declare {, } @llvm.aarch64.sve.fcvtzs.x2.nxv4i32.nxv4f32(,) diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvtl.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvtl.ll new file mode 100644 index 00000000000000..30dc7cbfaea6c9 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvtl.ll @@ -0,0 +1,11 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme-f16f16 -verify-machineinstrs < %s | FileCheck %s + +define {, } @multi_vector_cvtl_widen_x2_f16( %zn0) { +; CHECK-LABEL: multi_vector_cvtl_widen_x2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtl { z0.s, z1.s }, z0.h +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.fcvtl.widen.x2.nxv4f32( %zn0) + ret {, } %res +} From 1e44a9690915e8acf7b2a0e67b56aaf4509e9257 Mon Sep 17 00:00:00 2001 From: Lukacma Date: Wed, 29 May 2024 11:35:21 +0100 Subject: [PATCH 104/230] [AArch64][SME] Add intrinsics for vector groups ZERO (#93201) According to the specification in https://github.com/ARM-software/acle/pull/309 this adds the intrinsics: void svzero_za64_vg1x2(uint32_t slice) __arm_streaming __arm_inout("za"); void svzero_za64_vg1x4(uint32_t slice) __arm_streaming __arm_inout("za"); void svzero_za64_vg2x1(uint32_t slice) __arm_streaming __arm_inout("za"); void svzero_za64_vg2x2(uint32_t slice) __arm_streaming 
__arm_inout("za"); void svzero_za64_vg2x4(uint32_t slice) __arm_streaming __arm_inout("za"); void svzero_za64_vg4x1(uint32_t slice) __arm_streaming __arm_inout("za"); void svzero_za64_vg4x2(uint32_t slice) __arm_streaming __arm_inout("za"); void svzero_za64_vg4x4(uint32_t slice) __arm_streaming __arm_inout("za"); --- clang/include/clang/Basic/arm_sme.td | 19 ++ .../acle_sme2p1_zero.c | 139 +++++++++++++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 6 + llvm/lib/Target/AArch64/SMEInstrFormats.td | 46 ++++- .../CodeGen/AArch64/sme2p1-intrinsics-zero.ll | 190 ++++++++++++++++++ 5 files changed, 391 insertions(+), 9 deletions(-) create mode 100644 clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_zero.c create mode 100644 llvm/test/CodeGen/AArch64/sme2p1-intrinsics-zero.ll diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index 80e635e4a57eca..564a58e4eb6709 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -146,6 +146,25 @@ let TargetGuard = "sme" in { [IsOverloadNone, IsStreamingCompatible, IsOutZA]>; } +let TargetGuard = "sme2p1" in { + def SVZERO_ZA64_VG1x2 : SInst<"svzero_za64_vg1x2", "vm", "", MergeNone, "aarch64_sme_zero_za64_vg1x2", + [IsOverloadNone, IsStreaming, IsInOutZA]>; + def SVZERO_ZA64_VG1x4 : SInst<"svzero_za64_vg1x4", "vm", "", MergeNone, "aarch64_sme_zero_za64_vg1x4", + [IsOverloadNone, IsStreaming, IsInOutZA]>; + def SVZERO_ZA64_VG2x1 : SInst<"svzero_za64_vg2x1", "vm", "", MergeNone, "aarch64_sme_zero_za64_vg2x1", + [IsOverloadNone, IsStreaming, IsInOutZA]>; + def SVZERO_ZA64_VG2x2 : SInst<"svzero_za64_vg2x2", "vm", "", MergeNone, "aarch64_sme_zero_za64_vg2x2", + [IsOverloadNone, IsStreaming, IsInOutZA]>; + def SVZERO_ZA64_VG2x4 : SInst<"svzero_za64_vg2x4", "vm", "", MergeNone, "aarch64_sme_zero_za64_vg2x4", + [IsOverloadNone, IsStreaming, IsInOutZA]>; + def SVZERO_ZA64_VG4x1 : SInst<"svzero_za64_vg4x1", "vm", "", MergeNone, "aarch64_sme_zero_za64_vg4x1", + 
[IsOverloadNone, IsStreaming, IsInOutZA]>; + def SVZERO_ZA64_VG4x2 : SInst<"svzero_za64_vg4x2", "vm", "", MergeNone, "aarch64_sme_zero_za64_vg4x2", + [IsOverloadNone, IsStreaming, IsInOutZA]>; + def SVZERO_ZA64_VG4x4 : SInst<"svzero_za64_vg4x4", "vm", "", MergeNone, "aarch64_sme_zero_za64_vg4x4", + [IsOverloadNone, IsStreaming, IsInOutZA]>; +} + //////////////////////////////////////////////////////////////////////////////// // SME - Counting elements in a streaming vector diff --git a/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_zero.c b/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_zero.c new file mode 100644 index 00000000000000..2ad2044c267ed0 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_zero.c @@ -0,0 +1,139 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#define SVE_ACLE_FUNC(A1,A2) A1##A2 + +// CHECK-LABEL: define dso_local void @test_svzero_za64_vg1x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.zero.za64.vg1x2(i32 [[SLICE]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: define dso_local void @_Z22test_svzero_za64_vg1x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sme.zero.za64.vg1x2(i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret void +// +void test_svzero_za64_vg1x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + SVE_ACLE_FUNC(svzero_za64,_vg1x2)(slice); +} + +// CHECK-LABEL: define dso_local void @test_svzero_za64_vg1x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.zero.za64.vg1x4(i32 [[SLICE]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: define dso_local void @_Z22test_svzero_za64_vg1x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.zero.za64.vg1x4(i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret void +// +void test_svzero_za64_vg1x4(uint32_t slice) __arm_streaming __arm_inout("za"){ + SVE_ACLE_FUNC(svzero_za64,_vg1x4)(slice); +} + +// CHECK-LABEL: define dso_local void @test_svzero_za64_vg2x1( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.zero.za64.vg2x1(i32 [[SLICE]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: define dso_local void @_Z22test_svzero_za64_vg2x1j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.zero.za64.vg2x1(i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret void +// +void test_svzero_za64_vg2x1(uint32_t slice) __arm_streaming __arm_inout("za"){ + SVE_ACLE_FUNC(svzero_za64,_vg2x1)(slice); +} + +// CHECK-LABEL: define dso_local void @test_svzero_za64_vg2x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.zero.za64.vg2x2(i32 [[SLICE]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: define dso_local void @_Z22test_svzero_za64_vg2x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sme.zero.za64.vg2x2(i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret void +// +void test_svzero_za64_vg2x2(uint32_t slice) __arm_streaming __arm_inout("za"){ + SVE_ACLE_FUNC(svzero_za64,_vg2x2)(slice); +} + +// CHECK-LABEL: define dso_local void @test_svzero_za64_vg2x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.zero.za64.vg2x4(i32 [[SLICE]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: define dso_local void @_Z22test_svzero_za64_vg2x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.zero.za64.vg2x4(i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret void +// +void test_svzero_za64_vg2x4(uint32_t slice) __arm_streaming __arm_inout("za"){ + SVE_ACLE_FUNC(svzero_za64,_vg2x4)(slice); +} + +// CHECK-LABEL: define dso_local void @test_svzero_za64_vg4x1( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.zero.za64.vg4x1(i32 [[SLICE]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: define dso_local void @_Z22test_svzero_za64_vg4x1j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.zero.za64.vg4x1(i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret void +// +void test_svzero_za64_vg4x1(uint32_t slice) __arm_streaming __arm_inout("za"){ + SVE_ACLE_FUNC(svzero_za64,_vg4x1)(slice); +} + +// CHECK-LABEL: define dso_local void @test_svzero_za64_vg4x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.zero.za64.vg4x2(i32 [[SLICE]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: define dso_local void @_Z22test_svzero_za64_vg4x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sme.zero.za64.vg4x2(i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret void +// +void test_svzero_za64_vg4x2(uint32_t slice) __arm_streaming __arm_inout("za"){ + SVE_ACLE_FUNC(svzero_za64,_vg4x2)(slice); +} + +// CHECK-LABEL: define dso_local void @test_svzero_za64_vg4x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.zero.za64.vg4x4(i32 [[SLICE]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: define dso_local void @_Z22test_svzero_za64_vg4x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.zero.za64.vg4x4(i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret void +// +void test_svzero_za64_vg4x4(uint32_t slice) __arm_streaming __arm_inout("za"){ + SVE_ACLE_FUNC(svzero_za64,_vg4x4)(slice); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index f2028f8e8fd05a..9a71aaa9f44349 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3361,6 +3361,12 @@ let TargetPrefix = "aarch64" in { def int_aarch64_sve_bfmlslt : SME2_BFMLS_Intrinsic; def int_aarch64_sve_bfmlslt_lane : SME2_BFMLS_Lane_Intrinsic; + // Multi-vector zeroing + + foreach vg = ["vg1x2", "vg1x4", "vg2x1", "vg2x2", "vg2x4", "vg4x1", "vg4x2", "vg4x4"] in { + def int_aarch64_sme_zero_za64_ # vg : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects]>; + } + // Multi-vector signed saturating doubling multiply high def int_aarch64_sve_sqdmulh_single_vgx2 : SME2_VG2_Multi_Single_Intrinsic; diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index 50ee37b0dfebc8..b21b1faf5c9622 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -104,6 +104,13 @@ class sme2_move_to_tile_pseudo + : SMEPseudo2Instr, + Pseudo<(outs), (ins 
MatrixIndexGPR32Op8_11:$Rs, index_ty:$imm), []> { + let SMEMatrixType = za_flag; + let usesCustomInserter = 1; +} + //===----------------------------------------------------------------------===// // SME pattern match helpers. //===----------------------------------------------------------------------===// @@ -189,6 +196,9 @@ class SME2_Tile_VG4_Multi_Pat(name # _PSEUDO) $tile, $base, $offset, (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3))>; +class SME2_Zero_Matrix_Pat + : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, offset_ty:$offset))), + (!cast(name) $base, $offset)>; //===----------------------------------------------------------------------===// // SME pattern match helpers. //===----------------------------------------------------------------------===// @@ -4815,39 +4825,57 @@ class sme2p1_zero_matrix opc, Operand index_ty, string mnemonic, } multiclass sme2p1_zero_matrix { - def _VG2_Z : sme2p1_zero_matrix<{0b000,?,?,?}, sme_elm_idx0_7, mnemonic, "vgx2"> { + def _VG2_Z : sme2p1_zero_matrix<{0b000,?,?,?}, sme_elm_idx0_7, mnemonic, "vgx2">, SMEPseudo2Instr { bits<3> imm; let Inst{2-0} = imm; } - def _2Z : sme2p1_zero_matrix<{0b001,?,?,?}, uimm3s2range, mnemonic> { + def _2Z : sme2p1_zero_matrix<{0b001,?,?,?}, uimm3s2range, mnemonic>, SMEPseudo2Instr { bits<3> imm; let Inst{2-0} = imm; } - def _VG2_2Z : sme2p1_zero_matrix<{0b0100,?,?}, uimm2s2range, mnemonic, "vgx2"> { + def _VG2_2Z : sme2p1_zero_matrix<{0b0100,?,?}, uimm2s2range, mnemonic, "vgx2">, SMEPseudo2Instr { bits<2> imm; let Inst{1-0} = imm; } - def _VG4_2Z : sme2p1_zero_matrix<{0b0110,?,?}, uimm2s2range, mnemonic, "vgx4"> { + def _VG4_2Z : sme2p1_zero_matrix<{0b0110,?,?}, uimm2s2range, mnemonic, "vgx4">, SMEPseudo2Instr { bits<2> imm; let Inst{1-0} = imm; } - def _VG4_Z : sme2p1_zero_matrix<{0b100,?,?,?}, sme_elm_idx0_7, mnemonic, "vgx4"> { + def _VG4_Z : sme2p1_zero_matrix<{0b100,?,?,?}, sme_elm_idx0_7, mnemonic, "vgx4">, SMEPseudo2Instr { 
bits<3> imm; let Inst{2-0} = imm; } - def _4Z : sme2p1_zero_matrix<{0b1010,?,?}, uimm2s4range, mnemonic> { + def _4Z : sme2p1_zero_matrix<{0b1010,?,?}, uimm2s4range, mnemonic>, SMEPseudo2Instr { bits<2> imm; let Inst{1-0} = imm; } - def _VG2_4Z :sme2p1_zero_matrix<{0b11000,?}, uimm1s4range, mnemonic, "vgx2"> { + def _VG2_4Z : sme2p1_zero_matrix<{0b11000,?}, uimm1s4range, mnemonic, "vgx2">, SMEPseudo2Instr { bits<1> imm; let Inst{0} = imm; } - def _VG4_4Z :sme2p1_zero_matrix<{0b11100,?}, uimm1s4range, mnemonic, "vgx4"> { + def _VG4_4Z : sme2p1_zero_matrix<{0b11100,?}, uimm1s4range, mnemonic, "vgx4">, SMEPseudo2Instr { bits<1> imm; let Inst{0} = imm; } -} + + def NAME # _VG2_Z_PSEUDO : sem2p1_zero_matrix_pseudo; + def NAME # _VG4_Z_PSEUDO : sem2p1_zero_matrix_pseudo; + def NAME # _2Z_PSEUDO : sem2p1_zero_matrix_pseudo; + def NAME # _VG2_2Z_PSEUDO : sem2p1_zero_matrix_pseudo; + def NAME # _VG4_2Z_PSEUDO : sem2p1_zero_matrix_pseudo; + def NAME # _4Z_PSEUDO : sem2p1_zero_matrix_pseudo; + def NAME # _VG2_4Z_PSEUDO : sem2p1_zero_matrix_pseudo; + def NAME # _VG4_4Z_PSEUDO : sem2p1_zero_matrix_pseudo; + + def : SME2_Zero_Matrix_Pat; + def : SME2_Zero_Matrix_Pat; + def : SME2_Zero_Matrix_Pat; + def : SME2_Zero_Matrix_Pat; + def : SME2_Zero_Matrix_Pat; + def : SME2_Zero_Matrix_Pat; + def : SME2_Zero_Matrix_Pat; + def : SME2_Zero_Matrix_Pat; +} //===----------------------------------------------------------------------===// // SME2.1 lookup table expand two non-contiguous registers diff --git a/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-zero.ll b/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-zero.ll new file mode 100644 index 00000000000000..ba77637580f4cb --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-zero.ll @@ -0,0 +1,190 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -verify-machineinstrs < %s | FileCheck %s + +target triple = "aarch64-linux" + +define void @test_svzero_za64_vg1x2(i32 
%slice) #0 { +; CHECK-LABEL: test_svzero_za64_vg1x2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: zero za.d[w8, 0, vgx2] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.zero.za64.vg1x2(i32 %slice) + ret void +} + +define void @test_svzero_za64_vg1x2_offset(i32 %slice) #0 { +; CHECK-LABEL: test_svzero_za64_vg1x2_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: zero za.d[w8, 7, vgx2] +; CHECK-NEXT: ret +entry: + %slice.max = add i32 %slice, 7 + tail call void @llvm.aarch64.sme.zero.za64.vg1x2(i32 %slice.max) + ret void +} + +define void @test_svzero_za64_vg1x4(i32 %slice) #0 { +; CHECK-LABEL: test_svzero_za64_vg1x4: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: zero za.d[w8, 0, vgx4] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.zero.za64.vg1x4(i32 %slice) + ret void +} + +define void @test_svzero_za64_vg1x4_offset(i32 %slice) #0 { +; CHECK-LABEL: test_svzero_za64_vg1x4_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: zero za.d[w8, 1, vgx4] +; CHECK-NEXT: ret +entry: + %slice.min = add i32 %slice, 1 + tail call void @llvm.aarch64.sme.zero.za64.vg1x4(i32 %slice.min) + ret void +} + +define void @test_svzero_za64_vg2x1(i32 %slice) #0 { +; CHECK-LABEL: test_svzero_za64_vg2x1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: zero za.d[w8, 0:1] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.zero.za64.vg2x1(i32 %slice) + ret void +} + +define void @test_svzero_za64_vg2x1_offset(i32 %slice) #0 { +; CHECK-LABEL: test_svzero_za64_vg2x1_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: zero za.d[w8, 6:7] +; CHECK-NEXT: ret +entry: + %slice.max = add i32 %slice, 6 + tail call void @llvm.aarch64.sme.zero.za64.vg2x1(i32 %slice.max) + ret void +} + +define void @test_svzero_za64_vg2x2(i32 %slice) #0 { +; CHECK-LABEL: test_svzero_za64_vg2x2: +; CHECK: // %bb.0: // %entry +; 
CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: zero za.d[w8, 0:1, vgx2] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.zero.za64.vg2x2(i32 %slice) + ret void +} + +define void @test_svzero_za64_vg2x2_offset(i32 %slice) #0 { +; CHECK-LABEL: test_svzero_za64_vg2x2_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: zero za.d[w8, 2:3, vgx2] +; CHECK-NEXT: ret +entry: + %slice.max = add i32 %slice, 2 + tail call void @llvm.aarch64.sme.zero.za64.vg2x2(i32 %slice.max) + ret void +} + +define void @test_svzero_za64_vg2x4(i32 %slice) #0 { +; CHECK-LABEL: test_svzero_za64_vg2x4: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: zero za.d[w8, 0:1, vgx4] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.zero.za64.vg2x4(i32 %slice) + ret void +} + +define void @test_svzero_za64_vg2x4_offset(i32 %slice) #0 { +; CHECK-LABEL: test_svzero_za64_vg2x4_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add w8, w0, #1 +; CHECK-NEXT: zero za.d[w8, 0:1, vgx4] +; CHECK-NEXT: ret +entry: + %slice.min = add i32 %slice, 1 + tail call void @llvm.aarch64.sme.zero.za64.vg2x4(i32 %slice.min) + ret void +} + +define void @test_svzero_za64_vg4x1(i32 %slice) #0 { +; CHECK-LABEL: test_svzero_za64_vg4x1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: zero za.d[w8, 0:3] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.zero.za64.vg4x1(i32 %slice) + ret void +} + +define void @test_svzero_za64_vg4x1_offset(i32 %slice) #0 { +; CHECK-LABEL: test_svzero_za64_vg4x1_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: zero za.d[w8, 4:7] +; CHECK-NEXT: ret +entry: + %slice.max = add i32 %slice, 4 + tail call void @llvm.aarch64.sme.zero.za64.vg4x1(i32 %slice.max) + ret void +} + +define void @test_svzero_za64_vg4x2(i32 %slice) #0 { +; CHECK-LABEL: test_svzero_za64_vg4x2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: zero za.d[w8, 0:3, vgx2] +; CHECK-NEXT: 
ret +entry: + tail call void @llvm.aarch64.sme.zero.za64.vg4x2(i32 %slice) + ret void +} + +define void @test_svzero_za64_vg4x2_offset(i32 %slice) #0 { +; CHECK-LABEL: test_svzero_za64_vg4x2_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: zero za.d[w8, 0:3, vgx2] +; CHECK-NEXT: ret +entry: + %slice.max = add i32 %slice, 0 + tail call void @llvm.aarch64.sme.zero.za64.vg4x2(i32 %slice.max) + ret void +} + +define void @test_svzero_za64_vg4x4(i32 %slice) #0 { +; CHECK-LABEL: test_svzero_za64_vg4x4: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: zero za.d[w8, 0:3, vgx4] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.zero.za64.vg4x4(i32 %slice) + ret void +} + +define void @test_svzero_za64_vg4x4_offset(i32 %slice) #0 { +; CHECK-LABEL: test_svzero_za64_vg4x4_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add w8, w0, #1 +; CHECK-NEXT: zero za.d[w8, 0:3, vgx4] +; CHECK-NEXT: ret +entry: + %slice.min = add i32 %slice, 1 + tail call void @llvm.aarch64.sme.zero.za64.vg4x4(i32 %slice.min) + ret void +} + +attributes #0 = { nounwind "target-features" = "+sme2p1"} From 7fa45afa938e0feb0030b14a8633de7dd8e529cb Mon Sep 17 00:00:00 2001 From: Vyacheslav Levytskyy Date: Wed, 29 May 2024 12:52:55 +0200 Subject: [PATCH 105/230] [SPIR-V] Ensure that internal intrinsic functions are inserted at the correct positions (#93552) The goal of the PR is to ensure that newly inserted internal intrinsic functions are inserted at the correct positions, and don't break rules of instruction domination and PHI nodes grouping at top of basic block. 
This is a continuation of https://github.com/llvm/llvm-project/pull/92316 and https://github.com/llvm/llvm-project/pull/92536 --- llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp | 43 ++++++++++++++----- .../SPIRV/phi-spvintrinsic-dominate.ll | 39 +++++++++++++++++ 2 files changed, 71 insertions(+), 11 deletions(-) create mode 100644 llvm/test/CodeGen/SPIRV/phi-spvintrinsic-dominate.ll diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index ea53fe55e7ab57..e4bbeb53d16913 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -181,6 +181,14 @@ static void setInsertPointSkippingPhis(IRBuilder<> &B, Instruction *I) { B.SetInsertPoint(I); } +static void setInsertPointAfterDef(IRBuilder<> &B, Instruction *I) { + B.SetCurrentDebugLocation(I->getDebugLoc()); + if (I->getType()->isVoidTy()) + B.SetInsertPoint(I->getNextNode()); + else + B.SetInsertPoint(*I->getInsertionPointAfterDef()); +} + static bool requireAssignType(Instruction *I) { IntrinsicInst *Intr = dyn_cast(I); if (Intr) { @@ -560,6 +568,7 @@ void SPIRVEmitIntrinsics::preprocessUndefs(IRBuilder<> &B) { while (!Worklist.empty()) { Instruction *I = Worklist.front(); + bool BPrepared = false; Worklist.pop(); for (auto &Op : I->operands()) { @@ -567,7 +576,10 @@ void SPIRVEmitIntrinsics::preprocessUndefs(IRBuilder<> &B) { if (!AggrUndef || !Op->getType()->isAggregateType()) continue; - B.SetInsertPoint(I); + if (!BPrepared) { + setInsertPointSkippingPhis(B, I); + BPrepared = true; + } auto *IntrUndef = B.CreateIntrinsic(Intrinsic::spv_undef, {}, {}); Worklist.push(IntrUndef); I->replaceUsesOfWith(Op, IntrUndef); @@ -584,6 +596,7 @@ void SPIRVEmitIntrinsics::preprocessCompositeConstants(IRBuilder<> &B) { while (!Worklist.empty()) { auto *I = Worklist.front(); + bool IsPhi = isa(I), BPrepared = false; assert(I); bool KeepInst = false; for (const auto &Op : I->operands()) { @@ -615,7 +628,11 @@ void 
SPIRVEmitIntrinsics::preprocessCompositeConstants(IRBuilder<> &B) { else for (auto &COp : AggrConst->operands()) Args.push_back(COp); - B.SetInsertPoint(I); + if (!BPrepared) { + IsPhi ? B.SetInsertPointPastAllocas(I->getParent()->getParent()) + : B.SetInsertPoint(I); + BPrepared = true; + } auto *CI = B.CreateIntrinsic(Intrinsic::spv_const_composite, {ResTy}, {Args}); Worklist.push(CI); @@ -1111,8 +1128,7 @@ void SPIRVEmitIntrinsics::insertAssignPtrTypeIntrs(Instruction *I, isa(I)) return; - setInsertPointSkippingPhis(B, I->getNextNode()); - + setInsertPointAfterDef(B, I); Type *ElemTy = deduceElementType(I); Constant *EltTyConst = UndefValue::get(ElemTy); unsigned AddressSpace = getPointerAddressSpace(I->getType()); @@ -1127,7 +1143,7 @@ void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I, reportFatalOnTokenType(I); Type *Ty = I->getType(); if (!Ty->isVoidTy() && !isPointerTy(Ty) && requireAssignType(I)) { - setInsertPointSkippingPhis(B, I->getNextNode()); + setInsertPointAfterDef(B, I); Type *TypeToAssign = Ty; if (auto *II = dyn_cast(I)) { if (II->getIntrinsicID() == Intrinsic::spv_const_composite || @@ -1149,7 +1165,7 @@ void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I, if (isa(Op) && Op->getType()->isAggregateType()) buildIntrWithMD(Intrinsic::spv_assign_type, {B.getInt32Ty()}, Op, UndefValue::get(B.getInt32Ty()), {}, B); - else if (!isa(Op)) // TODO: This case could be removed + else if (!isa(Op)) buildIntrWithMD(Intrinsic::spv_assign_type, {Op->getType()}, Op, Op, {}, B); } @@ -1159,7 +1175,7 @@ void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I, void SPIRVEmitIntrinsics::insertSpirvDecorations(Instruction *I, IRBuilder<> &B) { if (MDNode *MD = I->getMetadata("spirv.Decorations")) { - B.SetInsertPoint(I->getNextNode()); + setInsertPointAfterDef(B, I); B.CreateIntrinsic(Intrinsic::spv_assign_decoration, {I->getType()}, {I, MetadataAsValue::get(I->getContext(), MD)}); } @@ -1170,7 +1186,7 @@ void 
SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I, auto *II = dyn_cast(I); if (II && II->getIntrinsicID() == Intrinsic::spv_const_composite && TrackConstants) { - B.SetInsertPoint(I->getNextNode()); + setInsertPointAfterDef(B, I); auto t = AggrConsts.find(I); assert(t != AggrConsts.end()); auto *NewOp = @@ -1179,6 +1195,7 @@ void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I, I->replaceAllUsesWith(NewOp); NewOp->setArgOperand(0, I); } + bool IsPhi = isa(I), BPrepared = false; for (const auto &Op : I->operands()) { if ((isa(Op) && Op->getType()->isVectorTy()) || isa(I) || isa(I)) @@ -1188,7 +1205,11 @@ void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I, if (II && ((II->getIntrinsicID() == Intrinsic::spv_gep && OpNo == 0) || (II->paramHasAttr(OpNo, Attribute::ImmArg)))) continue; - B.SetInsertPoint(I); + if (!BPrepared) { + IsPhi ? B.SetInsertPointPastAllocas(I->getParent()->getParent()) + : B.SetInsertPoint(I); + BPrepared = true; + } Value *OpTyVal = Op; if (Op->getType()->isTargetExtTy()) OpTyVal = Constant::getNullValue( @@ -1201,7 +1222,7 @@ void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I, } if (I->hasName()) { reportFatalOnTokenType(I); - setInsertPointSkippingPhis(B, I->getNextNode()); + setInsertPointAfterDef(B, I); std::vector Args = {I}; addStringImm(I->getName(), B, Args); B.CreateIntrinsic(Intrinsic::spv_assign_name, {I->getType()}, Args); @@ -1345,7 +1366,7 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) { for (auto *I : Worklist) { TrackConstants = true; if (!I->getType()->isVoidTy() || isa(I)) - B.SetInsertPoint(I->getNextNode()); + setInsertPointAfterDef(B, I); // Visitors return either the original/newly created instruction for further // processing, nullptr otherwise. 
I = visit(*I); diff --git a/llvm/test/CodeGen/SPIRV/phi-spvintrinsic-dominate.ll b/llvm/test/CodeGen/SPIRV/phi-spvintrinsic-dominate.ll new file mode 100644 index 00000000000000..471ab03ed89f65 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/phi-spvintrinsic-dominate.ll @@ -0,0 +1,39 @@ +; The goal of the test is to check that newly inserted internal (spv) +; intrinsic functions for PHI's operands are inserted at the correct +; positions, and don't break rules of instruction domination and PHI nodes +; grouping at top of basic block. + +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpFunction +; CHECK: OpBranch +; CHECK: OpLabel +; CHECK: OpPhi +; CHECK: OpPhi +; CHECK: OpPhi + +define spir_kernel void @foo(ptr addrspace(1) %_arg1) { +entry: + br label %l1 + +l1: + %sw = phi <4 x double> [ %vec, %l2 ], [ , %entry ] + %in = phi <3 x double> [ %ins, %l2 ], [ zeroinitializer, %entry ] + %r1 = phi i32 [ %r2, %l2 ], [ 0, %entry ] + %c1 = icmp ult i32 %r1, 3 + br i1 %c1, label %l2, label %exit + +l2: + %r3 = zext nneg i32 %r1 to i64 + %r4 = getelementptr inbounds double, ptr addrspace(1) %_arg1, i64 %r3 + %r5 = load double, ptr addrspace(1) %r4, align 8 + %ins = insertelement <3 x double> %in, double %r5, i32 %r1 + %exp = shufflevector <3 x double> %ins, <3 x double> poison, <4 x i32> + %vec = shufflevector <4 x double> %exp, <4 x double> %sw, <4 x i32> + %r2 = add nuw nsw i32 %r1, 1 + br label %l1 + +exit: + ret void +} From f63adf3b51008970cc7c3794c68c7a6e33e8d5dd Mon Sep 17 00:00:00 2001 From: Vyacheslav Levytskyy Date: Wed, 29 May 2024 12:53:08 +0200 Subject: [PATCH 106/230] [SPIR-V] Introduce support of llvm.ptr.annotation to SPIR-V Backend and implement extensions which make use of spirv.Decorations (#93561) This PR introduces support of llvm.ptr.annotation to SPIR-V Backend, and implement several extensions which make use of 
spirv.Decorations and llvm.ptr.annotation to annotate global variables and pointers: - SPV_INTEL_cache_controls - SPV_INTEL_global_variable_host_access - SPV_INTEL_global_variable_fpga_decorations --- llvm/docs/SPIRVUsage.rst | 6 + .../SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp | 7 + llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp | 7 + llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 9 ++ .../Target/SPIRV/SPIRVPrepareFunctions.cpp | 133 ++++++++++++++++++ .../lib/Target/SPIRV/SPIRVSymbolicOperands.td | 12 ++ .../basic-load-store.ll | 53 +++++++ .../decorate-prefetch-w-cache-controls.ll | 44 ++++++ .../global-var-decorations.ll | 33 +++++ .../global-var-host-access.ll | 34 +++++ .../SPIRV/llvm-intrinsics/ptr-annotation.ll | 41 ++++++ 11 files changed, 379 insertions(+) create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_cache_controls/basic-load-store.ll create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_cache_controls/decorate-prefetch-w-cache-controls.ll create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_global_variable_fpga_decorations/global-var-decorations.ll create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_global_variable_host_access/global-var-host-access.ll create mode 100644 llvm/test/CodeGen/SPIRV/llvm-intrinsics/ptr-annotation.ll diff --git a/llvm/docs/SPIRVUsage.rst b/llvm/docs/SPIRVUsage.rst index 657b0fb9b6724c..de27f6b2372db6 100644 --- a/llvm/docs/SPIRVUsage.rst +++ b/llvm/docs/SPIRVUsage.rst @@ -141,10 +141,16 @@ list of supported SPIR-V extensions, sorted alphabetically by their extension na - Allows generating arbitrary width integer types. * - ``SPV_INTEL_bfloat16_conversion`` - Adds instructions to convert between single-precision 32-bit floating-point values and 16-bit bfloat16 values. + * - ``SPV_INTEL_cache_controls`` + - Allows cache control information to be applied to memory access instructions. * - ``SPV_INTEL_function_pointers`` - Allows translation of function pointers. 
* - ``SPV_INTEL_inline_assembly`` - Allows to use inline assembly. + * - ``SPV_INTEL_global_variable_host_access`` + - Adds decorations that can be applied to global (module scope) variables. + * - ``SPV_INTEL_global_variable_fpga_decorations`` + - Adds decorations that can be applied to global (module scope) variables to help code generation for FPGA devices. * - ``SPV_INTEL_optnone`` - Adds OptNoneINTEL value for Function Control mask that indicates a request to not optimize the function. * - ``SPV_INTEL_subgroups`` diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp index 5c286acdcc9b39..ff8759755e5176 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp @@ -272,6 +272,13 @@ void SPIRVInstPrinter::printOpDecorate(const MCInst *MI, raw_ostream &O) { case Decoration::UserSemantic: printStringImm(MI, NumFixedOps, O); break; + case Decoration::HostAccessINTEL: + printOperand(MI, NumFixedOps, O); + if (NumFixedOps + 1 < MI->getNumOperands()) { + O << ' '; + printStringImm(MI, NumFixedOps + 1, O); + } + break; default: printRemainingVariableOps(MI, NumFixedOps, O, true); break; diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp index 7f531542544ab6..75aa1823b11f2a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp @@ -30,6 +30,13 @@ static const std::map SPIRV::Extension::Extension::SPV_EXT_shader_atomic_float_min_max}, {"SPV_INTEL_arbitrary_precision_integers", SPIRV::Extension::Extension::SPV_INTEL_arbitrary_precision_integers}, + {"SPV_INTEL_cache_controls", + SPIRV::Extension::Extension::SPV_INTEL_cache_controls}, + {"SPV_INTEL_global_variable_fpga_decorations", + SPIRV::Extension::Extension:: + SPV_INTEL_global_variable_fpga_decorations}, + {"SPV_INTEL_global_variable_host_access", + 
SPIRV::Extension::Extension::SPV_INTEL_global_variable_host_access}, {"SPV_INTEL_optnone", SPIRV::Extension::Extension::SPV_INTEL_optnone}, {"SPV_INTEL_usm_storage_classes", SPIRV::Extension::Extension::SPV_INTEL_usm_storage_classes}, diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index c86ab285f354fd..61f99f8d852695 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -703,6 +703,15 @@ static void addOpDecorateReqs(const MachineInstr &MI, unsigned DecIndex, static_cast(LinkageOp); if (LnkType == SPIRV::LinkageType::LinkOnceODR) Reqs.addExtension(SPIRV::Extension::SPV_KHR_linkonce_odr); + } else if (Dec == SPIRV::Decoration::CacheControlLoadINTEL || + Dec == SPIRV::Decoration::CacheControlStoreINTEL) { + Reqs.addExtension(SPIRV::Extension::SPV_INTEL_cache_controls); + } else if (Dec == SPIRV::Decoration::HostAccessINTEL) { + Reqs.addExtension(SPIRV::Extension::SPV_INTEL_global_variable_host_access); + } else if (Dec == SPIRV::Decoration::InitModeINTEL || + Dec == SPIRV::Decoration::ImplementInRegisterMapINTEL) { + Reqs.addExtension( + SPIRV::Extension::SPV_INTEL_global_variable_fpga_decorations); } } diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp index a8a0577f60564c..7bee87d7204ede 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp @@ -22,6 +22,7 @@ #include "SPIRVSubtarget.h" #include "SPIRVTargetMachine.h" #include "SPIRVUtils.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" @@ -29,6 +30,8 @@ #include "llvm/IR/IntrinsicsSPIRV.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/LowerMemIntrinsics.h" +#include +#include using namespace llvm; @@ -152,6 +155,132 @@ static bool 
lowerIntrinsicToFunction(IntrinsicInst *Intrinsic) { return true; } +static std::string getAnnotation(Value *AnnoVal, Value *OptAnnoVal) { + if (auto *Ref = dyn_cast_or_null(AnnoVal)) + AnnoVal = Ref->getOperand(0); + if (auto *Ref = dyn_cast_or_null(OptAnnoVal)) + OptAnnoVal = Ref->getOperand(0); + + std::string Anno; + if (auto *C = dyn_cast_or_null(AnnoVal)) { + StringRef Str; + if (getConstantStringInfo(C, Str)) + Anno = Str; + } + // handle optional annotation parameter in a way that Khronos Translator do + // (collect integers wrapped in a struct) + if (auto *C = dyn_cast_or_null(OptAnnoVal); + C && C->getNumOperands()) { + Value *MaybeStruct = C->getOperand(0); + if (auto *Struct = dyn_cast(MaybeStruct)) { + for (unsigned I = 0, E = Struct->getNumOperands(); I != E; ++I) { + if (auto *CInt = dyn_cast(Struct->getOperand(I))) + Anno += (I == 0 ? ": " : ", ") + + std::to_string(CInt->getType()->getIntegerBitWidth() == 1 + ? CInt->getZExtValue() + : CInt->getSExtValue()); + } + } else if (auto *Struct = dyn_cast(MaybeStruct)) { + // { i32 i32 ... } zeroinitializer + for (unsigned I = 0, E = Struct->getType()->getStructNumElements(); + I != E; ++I) + Anno += I == 0 ? 
": 0" : ", 0"; + } + } + return Anno; +} + +static SmallVector parseAnnotation(Value *I, + const std::string &Anno, + LLVMContext &Ctx, + Type *Int32Ty) { + // Try to parse the annotation string according to the following rules: + // annotation := ({kind} | {kind:value,value,...})+ + // kind := number + // value := number | string + static const std::regex R( + "\\{(\\d+)(?:[:,](\\d+|\"[^\"]*\")(?:,(\\d+|\"[^\"]*\"))*)?\\}"); + SmallVector MDs; + int Pos = 0; + for (std::sregex_iterator + It = std::sregex_iterator(Anno.begin(), Anno.end(), R), + ItEnd = std::sregex_iterator(); + It != ItEnd; ++It) { + if (It->position() != Pos) + return SmallVector{}; + Pos = It->position() + It->length(); + std::smatch Match = *It; + SmallVector MDsItem; + for (std::size_t i = 1; i < Match.size(); ++i) { + std::ssub_match SMatch = Match[i]; + std::string Item = SMatch.str(); + if (Item.length() == 0) + break; + if (Item[0] == '"') { + Item = Item.substr(1, Item.length() - 2); + // Acceptable format of the string snippet is: + static const std::regex RStr("^(\\d+)(?:,(\\d+))*$"); + if (std::smatch MatchStr; std::regex_match(Item, MatchStr, RStr)) { + for (std::size_t SubIdx = 1; SubIdx < MatchStr.size(); ++SubIdx) + if (std::string SubStr = MatchStr[SubIdx].str(); SubStr.length()) + MDsItem.push_back(ConstantAsMetadata::get( + ConstantInt::get(Int32Ty, std::stoi(SubStr)))); + } else { + MDsItem.push_back(MDString::get(Ctx, Item)); + } + } else if (int32_t Num; + std::from_chars(Item.data(), Item.data() + Item.size(), Num) + .ec == std::errc{}) { + MDsItem.push_back( + ConstantAsMetadata::get(ConstantInt::get(Int32Ty, Num))); + } else { + MDsItem.push_back(MDString::get(Ctx, Item)); + } + } + if (MDsItem.size() == 0) + return SmallVector{}; + MDs.push_back(MDNode::get(Ctx, MDsItem)); + } + return Pos == static_cast(Anno.length()) ? 
MDs + : SmallVector{}; +} + +static void lowerPtrAnnotation(IntrinsicInst *II) { + LLVMContext &Ctx = II->getContext(); + Type *Int32Ty = Type::getInt32Ty(Ctx); + + // Retrieve an annotation string from arguments. + Value *PtrArg = nullptr; + if (auto *BI = dyn_cast(II->getArgOperand(0))) + PtrArg = BI->getOperand(0); + else + PtrArg = II->getOperand(0); + std::string Anno = + getAnnotation(II->getArgOperand(1), + 4 < II->arg_size() ? II->getArgOperand(4) : nullptr); + + // Parse the annotation. + SmallVector MDs = parseAnnotation(II, Anno, Ctx, Int32Ty); + + // If the annotation string is not parsed successfully we don't know the + // format used and output it as a general UserSemantic decoration. + // Otherwise MDs is a Metadata tuple (a decoration list) in the format + // expected by `spirv.Decorations`. + if (MDs.size() == 0) { + auto UserSemantic = ConstantAsMetadata::get(ConstantInt::get( + Int32Ty, static_cast(SPIRV::Decoration::UserSemantic))); + MDs.push_back(MDNode::get(Ctx, {UserSemantic, MDString::get(Ctx, Anno)})); + } + + // Build the internal intrinsic function. + IRBuilder<> IRB(II->getParent()); + IRB.SetInsertPoint(II); + IRB.CreateIntrinsic( + Intrinsic::spv_assign_decoration, {PtrArg->getType()}, + {PtrArg, MetadataAsValue::get(Ctx, MDNode::get(Ctx, MDs))}); + II->replaceAllUsesWith(II->getOperand(0)); +} + static void lowerFunnelShifts(IntrinsicInst *FSHIntrinsic) { // Get a separate function - otherwise, we'd have to rework the CFG of the // current one. 
Then simply replace the intrinsic uses with a call to the new @@ -334,6 +463,10 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) { Changed |= toSpvOverloadedIntrinsic( II, Intrinsic::SPVIntrinsics::spv_lifetime_end, {1}); break; + case Intrinsic::ptr_annotation: + lowerPtrAnnotation(II); + Changed = true; + break; } } } diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 98cbd9d2c1f2e4..65b48c8acf6ab7 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -299,6 +299,9 @@ defm SPV_INTEL_function_pointers : ExtensionOperand<104>; defm SPV_INTEL_variable_length_array : ExtensionOperand<105>; defm SPV_INTEL_bfloat16_conversion : ExtensionOperand<106>; defm SPV_INTEL_inline_assembly : ExtensionOperand<107>; +defm SPV_INTEL_cache_controls : ExtensionOperand<108>; +defm SPV_INTEL_global_variable_host_access : ExtensionOperand<109>; +defm SPV_INTEL_global_variable_fpga_decorations : ExtensionOperand<110>; //===----------------------------------------------------------------------===// // Multiclass used to define Capabilities enum values and at the same time @@ -471,6 +474,10 @@ defm VariableLengthArrayINTEL : CapabilityOperand<5817, 0, 0, [SPV_INTEL_variabl defm GroupUniformArithmeticKHR : CapabilityOperand<6400, 0, 0, [SPV_KHR_uniform_group_instructions], []>; defm USMStorageClassesINTEL : CapabilityOperand<5935, 0, 0, [SPV_INTEL_usm_storage_classes], [Kernel]>; defm BFloat16ConversionINTEL : CapabilityOperand<6115, 0, 0, [SPV_INTEL_bfloat16_conversion], []>; +defm GlobalVariableHostAccessINTEL : CapabilityOperand<6187, 0, 0, [SPV_INTEL_global_variable_host_access], []>; +defm HostAccessINTEL : CapabilityOperand<6188, 0, 0, [SPV_INTEL_global_variable_host_access], []>; +defm GlobalVariableFPGADecorationsINTEL : CapabilityOperand<6189, 0, 0, [SPV_INTEL_global_variable_fpga_decorations], []>; +defm CacheControlsINTEL : 
CapabilityOperand<6441, 0, 0, [SPV_INTEL_cache_controls], []>; //===----------------------------------------------------------------------===// // Multiclass used to define SourceLanguage enum values and at the same time @@ -1206,6 +1213,11 @@ defm ReferencedIndirectlyINTEL : DecorationOperand<5602, 0, 0, [], [IndirectRefe defm ClobberINTEL : DecorationOperand<5607, 0, 0, [SPV_INTEL_inline_assembly], [AsmINTEL]>; defm SideEffectsINTEL : DecorationOperand<5608, 0, 0, [SPV_INTEL_inline_assembly], [AsmINTEL]>; defm ArgumentAttributeINTEL : DecorationOperand<6409, 0, 0, [], [FunctionPointersINTEL]>; +defm CacheControlLoadINTEL : DecorationOperand<6442, 0, 0, [], [CacheControlsINTEL]>; +defm CacheControlStoreINTEL : DecorationOperand<6443, 0, 0, [], [CacheControlsINTEL]>; +defm HostAccessINTEL : DecorationOperand<6188, 0, 0, [], [GlobalVariableHostAccessINTEL]>; +defm InitModeINTEL : DecorationOperand<6190, 0, 0, [], [GlobalVariableFPGADecorationsINTEL]>; +defm ImplementInRegisterMapINTEL : DecorationOperand<6191, 0, 0, [], [GlobalVariableFPGADecorationsINTEL]>; //===----------------------------------------------------------------------===// // Multiclass used to define BuiltIn enum values and at the same time diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_cache_controls/basic-load-store.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_cache_controls/basic-load-store.ll new file mode 100644 index 00000000000000..359f6d1c0f8e53 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_cache_controls/basic-load-store.ll @@ -0,0 +1,53 @@ +; Adapted from https://github.com/KhronosGroup/SPIRV-LLVM-Translator/tree/main/test/extensions/INTEL/SPV_INTEL_cache_controls + +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_cache_controls %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_cache_controls %s -o - -filetype=obj | spirv-val %} + +; 
CHECK-SPIRV: Capability CacheControlsINTEL +; CHECK-SPIRV: Extension "SPV_INTEL_cache_controls" +; CHECK-SPIRV-DAG: OpName %[[#GVar:]] "G" +; CHECK-SPIRV-DAG: OpName %[[#Arg:]] "buffer" +; CHECK-SPIRV-DAG: OpDecorate %[[#GVar]] CacheControlStoreINTEL 0 1 +; CHECK-SPIRV-DAG: OpDecorate %[[#GVar]] CacheControlStoreINTEL 1 3 +; CHECK-SPIRV-DAG: OpDecorate %[[#Arg]] CacheControlLoadINTEL 0 0 +; CHECK-SPIRV-DAG: OpDecorate %[[#Arg]] CacheControlStoreINTEL 0 1 +; CHECK-SPIRV-DAG: OpDecorate %[[#LoadPtr:]] CacheControlLoadINTEL 0 1 +; CHECK-SPIRV-DAG: OpDecorate %[[#LoadPtr]] CacheControlLoadINTEL 1 1 +; CHECK-SPIRV-DAG: OpDecorate %[[#StorePtr:]] CacheControlStoreINTEL 0 1 +; CHECK-SPIRV-DAG: OpDecorate %[[#StorePtr]] CacheControlStoreINTEL 1 2 +; CHECK-SPIRV: OpLoad %[[#]] %[[#LoadPtr]] +; CHECK-SPIRV: OpStore %[[#StorePtr]] %[[#]] + +@G = common addrspace(1) global i32 0, align 4, !spirv.Decorations !9 + +define spir_kernel void @test(ptr addrspace(1) %dummy, ptr addrspace(1) %buffer) !spirv.ParameterDecorations !12 { +entry: + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %buffer, i64 1, !spirv.Decorations !3 + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, ptr addrspace(1) %buffer, i64 0, !spirv.Decorations !6 + store i32 %0, ptr addrspace(1) %arrayidx1, align 4 + ret void +} + +!spirv.MemoryModel = !{!0} +!spirv.Source = !{!1} +!opencl.spir.version = !{!2} +!opencl.ocl.version = !{!2} + +!0 = !{i32 2, i32 2} +!1 = !{i32 3, i32 102000} +!2 = !{i32 1, i32 2} +!3 = !{!4, !5} +!4 = !{i32 6442, i32 0, i32 1} ; {CacheControlLoadINTEL, CacheLevel=0, Cached} +!5 = !{i32 6442, i32 1, i32 1} ; {CacheControlLoadINTEL, CacheLevel=1, Cached} +!6 = !{!7, !8} +!7 = !{i32 6443, i32 0, i32 1} ; {CacheControlStoreINTEL, CacheLevel=0, WriteThrough} +!8 = !{i32 6443, i32 1, i32 2} ; {CacheControlStoreINTEL, CacheLevel=1, WriteBack} +!9 = !{!10, !11} +!10 = !{i32 6443, i32 0, i32 1} ; {CacheControlStoreINTEL, CacheLevel=0, 
WriteThrough} +!11 = !{i32 6443, i32 1, i32 3} ; {CacheControlStoreINTEL, CacheLevel=1, Streaming} +!12 = !{!13, !14} +!13 = !{} +!14 = !{!15, !16} +!15 = !{i32 6442, i32 0, i32 0} ; {CacheControlLoadINTEL, CacheLevel=0, Uncached} +!16 = !{i32 6443, i32 0, i32 1} ; {CacheControlStoreINTEL, CacheLevel=0, WriteThrough} diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_cache_controls/decorate-prefetch-w-cache-controls.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_cache_controls/decorate-prefetch-w-cache-controls.ll new file mode 100644 index 00000000000000..9a13b720f61f74 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_cache_controls/decorate-prefetch-w-cache-controls.ll @@ -0,0 +1,44 @@ +; Adapted from https://github.com/KhronosGroup/SPIRV-LLVM-Translator/tree/main/test/extensions/INTEL/SPV_INTEL_cache_controls + +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_cache_controls %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_cache_controls %s -o - -filetype=obj | spirv-val %} + +; CHECK-SPIRV: Capability CacheControlsINTEL +; CHECK-SPIRV: Extension "SPV_INTEL_cache_controls" + +; CHECK-SPIRV-DAG: OpName %[[#Ptr1:]] "ptr1" +; CHECK-SPIRV-DAG: OpName %[[#Ptr2:]] "ptr2" +; CHECK-SPIRV-DAG: OpName %[[#Ptr3:]] "ptr3" +; CHECK-SPIRV-DAG: OpDecorate %[[#Ptr1]] CacheControlLoadINTEL 0 1 +; CHECK-SPIRV-DAG: OpDecorate %[[#Ptr2]] CacheControlLoadINTEL 1 1 +; CHECK-SPIRV-DAG: OpDecorate %[[#Ptr3]] CacheControlStoreINTEL 2 3 +; CHECK-SPIRV: OpExtInst %[[#]] %[[#]] prefetch %[[#Ptr1]] %[[#]] +; CHECK-SPIRV: OpExtInst %[[#]] %[[#]] prefetch %[[#Ptr2]] %[[#]] +; CHECK-SPIRV: OpExtInst %[[#]] %[[#]] prefetch %[[#Ptr3]] %[[#]] + +; 6442 stands for CacheControlLoadINTEL token +@.str.1 = private unnamed_addr addrspace(1) constant [16 x i8] c"../prefetch.hpp\00", section "llvm.metadata" +@.str.9 = private unnamed_addr addrspace(1) constant [13 
x i8] c"{6442:\220,1\22}\00", section "llvm.metadata" +@.str.10 = private unnamed_addr addrspace(1) constant [13 x i8] c"{6442:\221,1\22}\00", section "llvm.metadata" +@.str.11 = private unnamed_addr addrspace(1) constant [13 x i8] c"{6443:\222,3\22}\00", section "llvm.metadata" + +define weak_odr dso_local spir_kernel void @foo(ptr addrspace(1) noundef align 1 %_arg_dataPtr) { +entry: + %r0 = addrspacecast ptr addrspace(1) %_arg_dataPtr to ptr addrspace(4) + %ptr1 = tail call spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPvi(ptr addrspace(4) noundef %r0, i32 noundef 5) + %r1 = tail call ptr addrspace(1) @llvm.ptr.annotation.p1.p1(ptr addrspace(1) %ptr1, ptr addrspace(1) @.str.9, ptr addrspace(1) @.str.1, i32 76, ptr addrspace(1) null) + tail call spir_func void @_Z20__spirv_ocl_prefetchPU3AS1Kcm(ptr addrspace(1) noundef %r1, i64 noundef 1) + %arrayidx3.i = getelementptr inbounds i8, ptr addrspace(4) %r0, i64 1 + %ptr2 = tail call spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPvi(ptr addrspace(4) noundef %arrayidx3.i, i32 noundef 5) + %r2 = tail call ptr addrspace(1) @llvm.ptr.annotation.p1.p1(ptr addrspace(1) %ptr2, ptr addrspace(1) @.str.10, ptr addrspace(1) @.str.1, i32 80, ptr addrspace(1) null) + tail call spir_func void @_Z20__spirv_ocl_prefetchPU3AS1Kcm(ptr addrspace(1) noundef %r2, i64 noundef 1) + %arrayidx7.i = getelementptr inbounds i8, ptr addrspace(4) %r0, i64 2 + %ptr3 = tail call spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPvi(ptr addrspace(4) noundef %arrayidx7.i, i32 noundef 5) + %r3 = tail call ptr addrspace(1) @llvm.ptr.annotation.p1.p1(ptr addrspace(1) %ptr3, ptr addrspace(1) @.str.11, ptr addrspace(1) @.str.1, i32 80, ptr addrspace(1) null) + tail call spir_func void @_Z20__spirv_ocl_prefetchPU3AS1Kcm(ptr addrspace(1) noundef %r3, i64 noundef 2) + ret void +} + +declare ptr addrspace(1) @llvm.ptr.annotation.p1.p1(ptr addrspace(1), ptr 
addrspace(1), ptr addrspace(1), i32, ptr addrspace(1)) +declare dso_local spir_func void @_Z20__spirv_ocl_prefetchPU3AS1Kcm(ptr addrspace(1) noundef, i64 noundef) +declare dso_local spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPvi(ptr addrspace(4) noundef, i32 noundef) diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_global_variable_fpga_decorations/global-var-decorations.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_global_variable_fpga_decorations/global-var-decorations.ll new file mode 100644 index 00000000000000..40008873bf19bf --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_global_variable_fpga_decorations/global-var-decorations.ll @@ -0,0 +1,33 @@ +; Adapted from https://github.com/KhronosGroup/SPIRV-LLVM-Translator/tree/main/test/extensions/INTEL/SPV_INTEL_global_variable_fpga_decorations + +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_global_variable_fpga_decorations %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_global_variable_fpga_decorations %s -o - -filetype=obj | spirv-val %} + +; CHECK-SPIRV: Capability GlobalVariableFPGADecorationsINTEL +; CHECK-SPIRV: Extension "SPV_INTEL_global_variable_fpga_decorations" +; CHECK-SPIRV-DAG: OpName %[[#G1:]] "int_var" +; CHECK-SPIRV-DAG: OpName %[[#G2:]] "float_var" +; CHECK-SPIRV-DAG: OpName %[[#G3:]] "bool_var" +; CHECK-SPIRV-DAG: OpDecorate %[[#G1]] ImplementInRegisterMapINTEL 1 +; CHECK-SPIRV-DAG: OpDecorate %[[#G1]] InitModeINTEL 0 +; CHECK-SPIRV-DAG: OpDecorate %[[#G2]] ImplementInRegisterMapINTEL 1 +; CHECK-SPIRV-DAG: OpDecorate %[[#G2]] InitModeINTEL 1 +; CHECK-SPIRV-DAG: OpDecorate %[[#G3]] ImplementInRegisterMapINTEL 0 +; CHECK-SPIRV-DAG: OpDecorate %[[#G3]] InitModeINTEL 0 + +@int_var = addrspace(1) global i32 42, !spirv.Decorations !1 +@float_var = addrspace(1) global float 1.0, !spirv.Decorations !5 +@bool_var = 
addrspace(1) global i1 0, !spirv.Decorations !7 + +define spir_kernel void @test() { +entry: + ret void +} + +!1 = !{!2, !3} +!2 = !{i32 6191, i1 true} ; ImplementInRegisterMapINTEL = true +!3 = !{i32 6190, i32 0} ; InitModeINTEL = 0 +!4 = !{i32 6190, i32 1} ; InitModeINTEL = 1 +!5 = !{!2, !4} +!6 = !{i32 6191, i1 false} ; ImplementInRegisterMapINTEL = false +!7 = !{!6, !3} diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_global_variable_host_access/global-var-host-access.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_global_variable_host_access/global-var-host-access.ll new file mode 100644 index 00000000000000..1397435efb2d4f --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_global_variable_host_access/global-var-host-access.ll @@ -0,0 +1,34 @@ +; Adapted from https://github.com/KhronosGroup/SPIRV-LLVM-Translator/tree/main/test/extensions/INTEL/SPV_INTEL_global_variable_host_access + +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_global_variable_host_access,+SPV_INTEL_global_variable_fpga_decorations %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_global_variable_host_access,+SPV_INTEL_global_variable_fpga_decorations %s -o - -filetype=obj | spirv-val %} + +; CHECK-SPIRV-DAG: Capability GlobalVariableHostAccessINTEL +; CHECK-SPIRV-DAG: Capability GlobalVariableFPGADecorationsINTEL +; CHECK-SPIRV-DAG: Extension "SPV_INTEL_global_variable_host_access" +; CHECK-SPIRV-DAG: Extension "SPV_INTEL_global_variable_fpga_decorations" + +; CHECK-SPIRV-DAG: OpName %[[#G1:]] "int_var" +; CHECK-SPIRV-DAG: OpName %[[#G2:]] "bool_var" +; CHECK-SPIRV-DAG: OpName %[[#G3:]] "float_var" +; CHECK-SPIRV-DAG: OpDecorate %[[#G1]] HostAccessINTEL 1 "IntVarName" +; CHECK-SPIRV-DAG: OpDecorate %[[#G2]] HostAccessINTEL 3 "BoolVarName" +; CHECK-SPIRV-DAG: OpDecorate %[[#G3]] ImplementInRegisterMapINTEL 1 +; CHECK-SPIRV-DAG: OpDecorate %[[#G3]] 
InitModeINTEL 1 + +@int_var = addrspace(1) global i32 42, !spirv.Decorations !1 +@bool_var = addrspace(1) global i1 0, !spirv.Decorations !4 +@float_var = addrspace(1) global float 1.0, !spirv.Decorations !5 + +define spir_kernel void @test() { +entry: + ret void +} + +!1 = !{!2} +!2 = !{i32 6188, i32 1, !"IntVarName"} ; HostAccessINTEL 1 "IntVarName" +!3 = !{i32 6188, i32 3, !"BoolVarName"} ; HostAccessINTEL 3 "BoolVarName" +!4 = !{!3} +!5 = !{!6, !7} +!6 = !{i32 6191, i1 true} ; ImplementInRegisterMapINTEL = true +!7 = !{i32 6190, i32 1} ; InitModeINTEL = 1 diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ptr-annotation.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ptr-annotation.ll new file mode 100644 index 00000000000000..06f1d0bf7fd37c --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ptr-annotation.ll @@ -0,0 +1,41 @@ +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: OpName %[[#Foo:]] "foo" +; CHECK-DAG: OpName %[[#Ptr1:]] "_arg1" +; CHECK-DAG: OpName %[[#Ptr2:]] "_arg2" +; CHECK-DAG: OpName %[[#Ptr3:]] "_arg3" +; CHECK-DAG: OpName %[[#Ptr4:]] "_arg4" +; CHECK-DAG: OpName %[[#Ptr5:]] "_arg5" +; CHECK-DAG: OpDecorate %[[#Ptr1]] NonReadable +; CHECK-DAG: OpDecorate %[[#Ptr2]] Alignment 128 +; CHECK-DAG: OpDecorate %[[#Ptr2]] NonReadable +; CHECK-DAG: OpDecorate %[[#Ptr3]] Alignment 128 +; CHECK-DAG: OpDecorate %[[#Ptr3]] NonReadable +; CHECK-DAG: OpDecorate %[[#Ptr4]] Alignment 128 +; CHECK-DAG: OpDecorate %[[#Ptr4]] NonReadable +; CHECK-DAG: OpDecorate %[[#Ptr5]] UserSemantic "Unknown format" +; CHECK: %[[#Foo]] = OpFunction +; CHECK-NEXT: %[[#Ptr1]] = OpFunctionParameter +; CHECK-NEXT: %[[#Ptr2]] = OpFunctionParameter +; CHECK-NEXT: %[[#Ptr3]] = OpFunctionParameter +; CHECK-NEXT: %[[#Ptr4]] = OpFunctionParameter +; CHECK-NEXT: %[[#Ptr5]] = OpFunctionParameter +; CHECK: OpFunctionEnd + +@.str.0 = private 
unnamed_addr addrspace(1) constant [16 x i8] c"../prefetch.hpp\00", section "llvm.metadata" +@.str.1 = private unnamed_addr addrspace(1) constant [5 x i8] c"{25}\00", section "llvm.metadata" +@.str.2 = private unnamed_addr addrspace(1) constant [13 x i8] c"{44:128}{25}\00", section "llvm.metadata" +@.str.3 = private unnamed_addr addrspace(1) constant [15 x i8] c"{44:\22128\22}{25}\00", section "llvm.metadata" +@.str.4 = private unnamed_addr addrspace(1) constant [13 x i8] c"{44,128}{25}\00", section "llvm.metadata" +@.str.5 = private unnamed_addr addrspace(1) constant [15 x i8] c"Unknown format\00", section "llvm.metadata" + +define spir_kernel void @foo(ptr addrspace(1) %_arg1, ptr addrspace(1) %_arg2, ptr addrspace(1) %_arg3, ptr addrspace(1) %_arg4, ptr addrspace(1) %_arg5) { +entry: + %r1 = tail call ptr addrspace(1) @llvm.ptr.annotation.p1.p1(ptr addrspace(1) %_arg1, ptr addrspace(1) @.str.1, ptr addrspace(1) @.str.0, i32 80, ptr addrspace(1) null) + %r2 = tail call ptr addrspace(1) @llvm.ptr.annotation.p1.p1(ptr addrspace(1) %_arg2, ptr addrspace(1) @.str.2, ptr addrspace(1) @.str.0, i32 80, ptr addrspace(1) null) + %r3 = tail call ptr addrspace(1) @llvm.ptr.annotation.p1.p1(ptr addrspace(1) %_arg3, ptr addrspace(1) @.str.3, ptr addrspace(1) @.str.0, i32 80, ptr addrspace(1) null) + %r4 = tail call ptr addrspace(1) @llvm.ptr.annotation.p1.p1(ptr addrspace(1) %_arg4, ptr addrspace(1) @.str.4, ptr addrspace(1) @.str.0, i32 80, ptr addrspace(1) null) + %r5 = tail call ptr addrspace(1) @llvm.ptr.annotation.p1.p1(ptr addrspace(1) %_arg5, ptr addrspace(1) @.str.5, ptr addrspace(1) @.str.0, i32 80, ptr addrspace(1) null) + ret void +} From 7c917e8268225735bf6fe0f7d8491fc944358e47 Mon Sep 17 00:00:00 2001 From: Vyacheslav Levytskyy Date: Wed, 29 May 2024 12:53:37 +0200 Subject: [PATCH 107/230] [SPIR-V] Implement correct zeroinitializer for extension types in SPIR-V Backend (#93607) This PR implements correct zeroinitializer for extension types in SPIR-V Backend. 
Previous version has just created 0 of 32/64 integer type (depending on target machine word size), that caused re-use and type re-write of the corresponding integer constant 0 with a potential crash on wrong usage of the constant (i.e., 0 of integer type expected but extension type found). E.g., the following code would crash without the PR: ``` %r1 = icmp ne i64 %_arg_i, 0 %e1 = tail call spir_func target("spirv.Event") @__spirv_GroupAsyncCopy(i32 2, ptr addrspace(3) %_arg_local, ptr addrspace(1) %_arg_ptr, i64 1, i64 1, target("spirv.Event") zeroinitializer) ``` because 0 in icmp would eventually be of `Event` type. --- llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp | 3 +- llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp | 29 +++++++++++++------ llvm/test/CodeGen/SPIRV/event-zero-const.ll | 23 +++++++++++++++ 3 files changed, 44 insertions(+), 11 deletions(-) create mode 100644 llvm/test/CodeGen/SPIRV/event-zero-const.ll diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index e4bbeb53d16913..ffbd1e17bad5e7 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -1212,8 +1212,7 @@ void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I, } Value *OpTyVal = Op; if (Op->getType()->isTargetExtTy()) - OpTyVal = Constant::getNullValue( - IntegerType::get(I->getContext(), GR->getPointerSize())); + OpTyVal = PoisonValue::get(Op->getType()); auto *NewOp = buildIntrWithMD(Intrinsic::spv_track_constant, {Op->getType(), OpTyVal->getType()}, Op, OpTyVal, {}, B); diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index 85299a49a6b94d..624899600693ac 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -40,6 +40,7 @@ class SPIRVPreLegalizer : public MachineFunctionPass { static void addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR, + const 
SPIRVSubtarget &STI, DenseMap &TargetExtConstTypes) { MachineRegisterInfo &MRI = MF.getRegInfo(); DenseMap RegsAlreadyAddedToDT; @@ -82,8 +83,17 @@ addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR, if (Const->getType()->isTargetExtTy()) { // remember association so that we can restore it when assign types MachineInstr *SrcMI = MRI.getVRegDef(SrcReg); - if (SrcMI && SrcMI->getOpcode() == TargetOpcode::G_CONSTANT) + if (SrcMI && (SrcMI->getOpcode() == TargetOpcode::G_CONSTANT || + SrcMI->getOpcode() == TargetOpcode::G_IMPLICIT_DEF)) TargetExtConstTypes[SrcMI] = Const->getType(); + if (Const->isNullValue()) { + MachineIRBuilder MIB(MF); + SPIRVType *ExtType = + GR->getOrCreateSPIRVType(Const->getType(), MIB); + SrcMI->setDesc(STI.getInstrInfo()->get(SPIRV::OpConstantNull)); + SrcMI->addOperand(MachineOperand::CreateReg( + GR->getSPIRVTypeID(ExtType), false)); + } } } else { RegsAlreadyAddedToDT[&MI] = Reg; @@ -394,6 +404,7 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, for (auto MII = std::prev(MBB->end()), Begin = MBB->begin(); !ReachedBegin;) { MachineInstr &MI = *MII; + unsigned MIOp = MI.getOpcode(); if (isSpvIntrinsic(MI, Intrinsic::spv_assign_ptr_type)) { Register Reg = MI.getOperand(1).getReg(); @@ -419,9 +430,9 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, if (Def->getOpcode() != TargetOpcode::G_GLOBAL_VALUE) insertAssignInstr(Reg, Ty, nullptr, GR, MIB, MF.getRegInfo()); ToErase.push_back(&MI); - } else if (MI.getOpcode() == TargetOpcode::G_CONSTANT || - MI.getOpcode() == TargetOpcode::G_FCONSTANT || - MI.getOpcode() == TargetOpcode::G_BUILD_VECTOR) { + } else if (MIOp == TargetOpcode::G_CONSTANT || + MIOp == TargetOpcode::G_FCONSTANT || + MIOp == TargetOpcode::G_BUILD_VECTOR) { // %rc = G_CONSTANT ty Val // ===> // %cty = OpType* ty @@ -435,15 +446,15 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, continue; } Type *Ty = nullptr; - if (MI.getOpcode() == 
TargetOpcode::G_CONSTANT) { + if (MIOp == TargetOpcode::G_CONSTANT) { auto TargetExtIt = TargetExtConstTypes.find(&MI); Ty = TargetExtIt == TargetExtConstTypes.end() ? MI.getOperand(1).getCImm()->getType() : TargetExtIt->second; - } else if (MI.getOpcode() == TargetOpcode::G_FCONSTANT) { + } else if (MIOp == TargetOpcode::G_FCONSTANT) { Ty = MI.getOperand(1).getFPImm()->getType(); } else { - assert(MI.getOpcode() == TargetOpcode::G_BUILD_VECTOR); + assert(MIOp == TargetOpcode::G_BUILD_VECTOR); Type *ElemTy = nullptr; MachineInstr *ElemMI = MRI.getVRegDef(MI.getOperand(1).getReg()); assert(ElemMI); @@ -459,7 +470,7 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, Ty = VectorType::get(ElemTy, NumElts, false); } insertAssignInstr(Reg, Ty, nullptr, GR, MIB, MRI); - } else if (MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE) { + } else if (MIOp == TargetOpcode::G_GLOBAL_VALUE) { propagateSPIRVType(&MI, GR, MRI, MIB); } @@ -802,7 +813,7 @@ bool SPIRVPreLegalizer::runOnMachineFunction(MachineFunction &MF) { MachineIRBuilder MIB(MF); // a registry of target extension constants DenseMap TargetExtConstTypes; - addConstantsToTrack(MF, GR, TargetExtConstTypes); + addConstantsToTrack(MF, GR, ST, TargetExtConstTypes); foldConstantsIntoIntrinsics(MF); insertBitcasts(MF, GR, MIB); generateAssignInstrs(MF, GR, MIB, TargetExtConstTypes); diff --git a/llvm/test/CodeGen/SPIRV/event-zero-const.ll b/llvm/test/CodeGen/SPIRV/event-zero-const.ll new file mode 100644 index 00000000000000..b40456d233f12f --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/event-zero-const.ll @@ -0,0 +1,23 @@ +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK: %[[#LongTy:]] = OpTypeInt 
64 0 +; CHECK: %[[#EventTy:]] = OpTypeEvent +; CHECK: %[[#LongNull:]] = OpConstantNull %[[#LongTy]] +; CHECK: %[[#EventNull:]] = OpConstantNull %[[#EventTy]] +; CHECK: OpFunction +; CHECK: OpINotEqual %[[#]] %[[#]] %[[#LongNull]] +; CHECK: OpGroupAsyncCopy %[[#EventTy]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#EventNull]] + + +define weak_odr dso_local spir_kernel void @foo(i64 %_arg_i, ptr addrspace(1) %_arg_ptr, ptr addrspace(3) %_arg_local) { +entry: + %r1 = icmp ne i64 %_arg_i, 0 + %e1 = tail call spir_func target("spirv.Event") @__spirv_GroupAsyncCopy(i32 2, ptr addrspace(3) %_arg_local, ptr addrspace(1) %_arg_ptr, i64 1, i64 1, target("spirv.Event") zeroinitializer) + ret void +} + +declare dso_local spir_func target("spirv.Event") @__spirv_GroupAsyncCopy(i32, ptr addrspace(3), ptr addrspace(1), i64, i64, target("spirv.Event")) From 42a0fb2333344077dc8aafd65b50d0ece886cf4e Mon Sep 17 00:00:00 2001 From: zjgarvey <47986913+zjgarvey@users.noreply.github.com> Date: Wed, 29 May 2024 05:55:05 -0500 Subject: [PATCH 108/230] [mlir][linalg] Add linalg.conv_2d_ngchw_gfchw_q to named ops (#92136) Adds a named op: linalg.conv_2d_ngchw_gfchw_q. This op is similar to linalg.conv_2d_ngchw_gfchw, but additionally incorporates zero point offset corrections. 
--- .../Linalg/IR/LinalgNamedStructuredOps.yaml | 138 ++++++++++++++++++ .../linalg/opdsl/ops/core_named_ops.py | 35 +++++ .../Dialect/Linalg/generalize-named-ops.mlir | 31 ++++ mlir/test/Dialect/Linalg/named-ops.mlir | 15 ++ 4 files changed, 219 insertions(+) diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml index eb7dd37010a672..fad234a9dcae9c 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml @@ -3478,6 +3478,144 @@ structured_op: !LinalgStructuredOpConfig - !ScalarExpression scalar_arg: K --- !LinalgOpConfig +metadata: !LinalgOpMetadata + name: conv_2d_ngchw_gfchw_q + cpp_class_name: Conv2DNgchwGfchwQOp + doc: |- + Performs 2-D grouped convolution with zero-point offsets. + + Layout: + * Input: NGCHW. + * Kernel: GFCHW. + + Numeric casting is performed on the operands to the inner multiply, promoting + them to the same data type as the accumulator/output. This includes the zero + point offsets common to quantized operations. 
+ implements: + - LinalgConvolutionOpInterface +structured_op: !LinalgStructuredOpConfig + args: + - !LinalgOperandDefConfig + name: I + kind: input_tensor + type_var: T1 + shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] -> + (s0, s1, s2, s3 * s4 + s5 * s6, s7 * s8 + s9 * s10)> + - !LinalgOperandDefConfig + name: K + kind: input_tensor + type_var: T2 + shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] -> + (s1, s11, s2, s5, s9)> + - !LinalgOperandDefConfig + name: IZp + kind: scalar + type_var: I32 + - !LinalgOperandDefConfig + name: KZp + kind: scalar + type_var: I32 + - !LinalgOperandDefConfig + name: O + kind: output_tensor + type_var: U + shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] -> + (s0, s1, s11, s3, s7)> + - !LinalgOperandDefConfig + name: strides + kind: index_attr + index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] + -> (s4, s8)> + default_indices: + - 1 + - 1 + - !LinalgOperandDefConfig + name: dilations + kind: index_attr + index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] + -> (s6, s10)> + default_indices: + - 1 + - 1 + indexing_maps: !LinalgIndexingMapsConfig + static_indexing_maps: + - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7, + s8, s9, s10, s11] -> (d0, d1, d5, d3 * s4 + d6 * s6, d4 * s8 + d7 * s10)> + - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7, + s8, s9, s10, s11] -> (d1, d2, d5, d6, d7)> + - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7, + s8, s9, s10, s11] -> ()> + - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7, + s8, s9, s10, s11] -> ()> + - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7, + s8, s9, s10, s11] -> (d0, d1, d2, d3, d4)> + iterator_types: + - parallel + - parallel + - parallel + - parallel + - parallel + - reduction + - reduction + - 
reduction + assignments: + - !ScalarAssign + arg: O + value: !ScalarExpression + scalar_fn: + kind: binary + fn_name: add + operands: + - !ScalarExpression + scalar_arg: O + - !ScalarExpression + scalar_fn: + kind: binary + fn_name: mul + operands: + - !ScalarExpression + scalar_fn: + kind: binary + fn_name: sub + operands: + - !ScalarExpression + scalar_fn: + kind: type + fn_name: cast_signed + type_var: U + operands: + - !ScalarExpression + scalar_arg: I + - !ScalarExpression + scalar_fn: + kind: type + fn_name: cast_signed + type_var: U + operands: + - !ScalarExpression + scalar_arg: IZp + - !ScalarExpression + scalar_fn: + kind: binary + fn_name: sub + operands: + - !ScalarExpression + scalar_fn: + kind: type + fn_name: cast_signed + type_var: U + operands: + - !ScalarExpression + scalar_arg: K + - !ScalarExpression + scalar_fn: + kind: type + fn_name: cast_signed + type_var: U + operands: + - !ScalarExpression + scalar_arg: KZp +--- !LinalgOpConfig metadata: !LinalgOpMetadata name: conv_3d_ndhwc_dhwcf cpp_class_name: Conv3DNdhwcDhwcfOp diff --git a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py index d73428a0f4df3b..43410aaa6af1be 100644 --- a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py +++ b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py @@ -958,6 +958,41 @@ def conv_2d_ngchw_gfchw( ) * TypeFn.cast_signed(U, K[D.g, D.fg, D.c, D.kh, D.kw]) +@linalg_structured_op +def conv_2d_ngchw_gfchw_q( + I=TensorDef( + T1, S.N, S.G, S.C, S.OH * S.SH + S.KH * S.DH, S.OW * S.SW + S.KW * S.DW + ), + K=TensorDef(T2, S.G, S.FG, S.C, S.KH, S.KW), + IZp=ScalarDef(I32), + KZp=ScalarDef(I32), + O=TensorDef(U, S.N, S.G, S.FG, S.OH, S.OW, output=True), + strides=IndexAttrDef(S.SH, S.SW, default=[1, 1]), + dilations=IndexAttrDef(S.DH, S.DW, default=[1, 1]), +): + """Performs 2-D grouped convolution with zero-point offsets. + + Layout: + * Input: NGCHW. + * Kernel: GFCHW. 
+ + Numeric casting is performed on the operands to the inner multiply, promoting + them to the same data type as the accumulator/output. This includes the zero + point offsets common to quantized operations. + """ + implements(ConvolutionOpInterface) + domain(D.n, D.g, D.fg, D.oh, D.ow, D.c, D.kh, D.kw) + O[D.n, D.g, D.fg, D.oh, D.ow] += ( + TypeFn.cast_signed( + U, I[D.n, D.g, D.c, D.oh * S.SH + D.kh * S.DH, D.ow * S.SW + D.kw * S.DW] + ) + - TypeFn.cast_signed(U, IZp) + ) * ( + TypeFn.cast_signed(U, K[D.g, D.fg, D.c, D.kh, D.kw]) + - TypeFn.cast_signed(U, KZp) + ) + + @linalg_structured_op def conv_3d_ndhwc_dhwcf( I=TensorDef( diff --git a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir index 4f43ec2c9e1cee..31fac9b4b41659 100644 --- a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir +++ b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir @@ -204,6 +204,37 @@ func.func @conv_1d_ncw_fcw(%input: memref, %filter: memref // ----- +func.func @conv_2d_ngchw_gfchw_q(%input: memref, %filter: memref, %inputzp: i32, %filterzp: i32, %output: memref) { + linalg.conv_2d_ngchw_gfchw_q {dilations = dense<1> : tensor<2xi64>, + strides = dense<1> : tensor<2xi64>} + ins (%input, %filter, %inputzp, %filterzp: memref, memref, i32, i32) + outs (%output: memref) + return +} +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d5, d3 + d6, d4 + d7)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d2, d5, d6, d7)> +// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> ()> +// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4)> + +// CHECK: func @conv_2d_ngchw_gfchw_q + +// CHECK: linalg.generic +// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]], #[[MAP2]], #[[MAP3]]] +// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "reduction", "reduction", 
"reduction"]} +// CHECK-SAME: ins(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}} : memref, memref, i32, i32) +// CHECK-SAME: outs(%{{.+}} : memref) + +// CHECK: ^{{.+}}(%[[BBARG0:.+]]: i8, %[[BBARG1:.+]]: i8, %[[BBARG2:.+]]: i32, %[[BBARG3:.+]]: i32, %[[BBARG4:.+]]: i32) +// CHECK-NEXT: %[[EXTSI0:.+]] = arith.extsi %[[BBARG0]] : i8 to i32 +// CHECK-NEXT: %[[SUB0:.+]] = arith.subi %[[EXTSI0]], %[[BBARG2]] : i32 +// CHECK-NEXT: %[[EXTSI1:.+]] = arith.extsi %[[BBARG1]] : i8 to i32 +// CHECK-NEXT: %[[SUB1:.+]] = arith.subi %[[EXTSI1]], %[[BBARG3]] : i32 +// CHECK-NEXT: %[[MUL:.+]] = arith.muli %[[SUB0]], %[[SUB1]] : i32 +// CHECK-NEXT: %[[ADD:.+]] = arith.addi %[[BBARG4]], %[[MUL]] : i32 +// CHECK-NEXT: linalg.yield %[[ADD]] : i32 + +// ----- + func.func @generalize_fill(%output: memref, %value : f32) { linalg.fill ins(%value : f32) outs(%output : memref) return diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir index 051054e67edf09..02ecbed232c8b5 100644 --- a/mlir/test/Dialect/Linalg/named-ops.mlir +++ b/mlir/test/Dialect/Linalg/named-ops.mlir @@ -441,6 +441,21 @@ func.func @conv_2d_ngchw_gfchw(%input: tensor<1x5x3x32x32xf32>, %filter: tensor< // ----- +// CHECK-LABEL: func @conv_2d_ngchw_gfchw_q +func.func @conv_2d_ngchw_gfchw_q(%input: tensor<1x5x3x32x32xi8>, %filter: tensor<5x2x3x3x3xi8>, %inputzp: i32, %filterzp: i32, %init: tensor<1x5x2x30x30xi32>) -> tensor<1x5x2x30x30xi32> { + // CHECK: linalg.conv_2d_ngchw_gfchw_q + // CHECK-SAME: dilations = dense<1> : tensor<2xi64> + // CHECK-SAME: strides = dense<1> : tensor<2xi64> + // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<1x5x3x32x32xi8>, tensor<5x2x3x3x3xi8>, i32, i32) + // CHECK-SAME: outs(%{{.+}} : tensor<1x5x2x30x30xi32>) -> tensor<1x5x2x30x30xi32> + %0 = linalg.conv_2d_ngchw_gfchw_q {dilations = dense<1> : tensor<2xi64>, + strides = dense<1> : tensor<2xi64>} + ins (%input, %filter, %inputzp, %filterzp: tensor<1x5x3x32x32xi8>, tensor<5x2x3x3x3xi8>, i32, i32) + outs (%init: 
tensor<1x5x2x30x30xi32>) -> tensor<1x5x2x30x30xi32> + return %0 : tensor<1x5x2x30x30xi32> +} +// ----- + // CHECK-LABEL: func @conv_3d_ndhwc_dhwcf func.func @conv_3d_ndhwc_dhwcf(%input: tensor, %filter: tensor, %init: tensor) -> tensor { // CHECK: %{{.+}} = linalg.conv_3d_ndhwc_dhwcf From a860e89028a004bc5b46ce0952b75d4f85a5927d Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 29 May 2024 11:58:09 +0100 Subject: [PATCH 109/230] [RISCV] Don't recompute getDemanded in RISCVInsertVSETVLI::needVSETVLI. NFC This also makes the function a bit easier to reason about since we can remove the assert. Eventually we might be able to replace needVSETVLI with VSETVLIInfo::isCompatible. --- llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index c0b2a695b8ea45..2c0a807e446856 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -882,7 +882,7 @@ class RISCVInsertVSETVLI : public MachineFunctionPass { StringRef getPassName() const override { return RISCV_INSERT_VSETVLI_NAME; } private: - bool needVSETVLI(const MachineInstr &MI, const VSETVLIInfo &Require, + bool needVSETVLI(const DemandedFields &Used, const VSETVLIInfo &Require, const VSETVLIInfo &CurInfo) const; bool needVSETVLIPHI(const VSETVLIInfo &Require, const MachineBasicBlock &MBB) const; @@ -1175,17 +1175,13 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB, } /// Return true if a VSETVLI is required to transition from CurInfo to Require -/// before MI. -bool RISCVInsertVSETVLI::needVSETVLI(const MachineInstr &MI, +/// given a set of DemandedFields \p Used. 
+bool RISCVInsertVSETVLI::needVSETVLI(const DemandedFields &Used, const VSETVLIInfo &Require, const VSETVLIInfo &CurInfo) const { - assert(Require == computeInfoForInstr(MI, MI.getDesc().TSFlags, *ST, LIS)); - if (!CurInfo.isValid() || CurInfo.isUnknown() || CurInfo.hasSEWLMULRatioOnly()) return true; - DemandedFields Used = getDemanded(MI, ST); - if (CurInfo.isCompatible(Used, Require, LIS)) return false; @@ -1232,16 +1228,17 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info, if (!RISCVII::hasSEWOp(TSFlags)) return; + DemandedFields Demanded = getDemanded(MI, ST); + const VSETVLIInfo NewInfo = computeInfoForInstr(MI, TSFlags, *ST, LIS); assert(NewInfo.isValid() && !NewInfo.isUnknown()); - if (Info.isValid() && !needVSETVLI(MI, NewInfo, Info)) + if (Info.isValid() && !needVSETVLI(Demanded, NewInfo, Info)) return; const VSETVLIInfo PrevInfo = Info; if (!Info.isValid() || Info.isUnknown()) Info = NewInfo; - DemandedFields Demanded = getDemanded(MI, ST); const VSETVLIInfo IncomingInfo = adjustIncoming(PrevInfo, NewInfo, Demanded); // If MI only demands that VL has the same zeroness, we only need to set the From 7ee511217b0d1cfd3269e9d2a89acf335ca9a9ea Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Wed, 29 May 2024 07:40:52 -0400 Subject: [PATCH 110/230] [gn build] Port 04f01a2b9ced --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 6bd56dd4117b03..8a5f6d1908784b 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -289,7 +289,6 @@ if (current_toolchain == default_toolchain) { "__atomic/kill_dependency.h", "__atomic/memory_order.h", "__atomic/to_gcc_order.h", - "__availability", "__bit/bit_cast.h", "__bit/bit_ceil.h", "__bit/bit_floor.h", @@ -385,7 +384,9 @@ if (current_toolchain == default_toolchain) { 
"__condition_variable/condition_variable.h", "__config", "__configuration/abi.h", + "__configuration/availability.h", "__configuration/compiler.h", + "__configuration/language.h", "__configuration/platform.h", "__coroutine/coroutine_handle.h", "__coroutine/coroutine_traits.h", From 9c4bae7c7c5be754f98bc495d51dd122609cd649 Mon Sep 17 00:00:00 2001 From: Shengchen Kan Date: Wed, 29 May 2024 19:40:45 +0800 Subject: [PATCH 111/230] [X86][CodeGen] Disable NDD2NonNDD compression for CFCMOV --- llvm/lib/Target/X86/X86CompressEVEX.cpp | 3 ++- llvm/test/CodeGen/X86/apx/compress-evex.mir | 19 +++++++++++++++++++ .../TableGen/X86ManualCompressEVEXTables.def | 8 ++++++++ 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp index cadfda93d4b196..11b2155e3f985d 100644 --- a/llvm/lib/Target/X86/X86CompressEVEX.cpp +++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp @@ -181,7 +181,8 @@ static bool isRedundantNewDataDest(MachineInstr &MI, const X86Subtarget &ST) { const MCInstrDesc &Desc = MI.getDesc(); Register Reg0 = MI.getOperand(0).getReg(); const MachineOperand &Op1 = MI.getOperand(1); - if (!Op1.isReg() || X86::getFirstAddrOperandIdx(MI) == 1) + if (!Op1.isReg() || X86::getFirstAddrOperandIdx(MI) == 1 || + X86::isCFCMOVCC(MI.getOpcode())) return false; Register Reg1 = Op1.getReg(); if (Reg1 == Reg0) diff --git a/llvm/test/CodeGen/X86/apx/compress-evex.mir b/llvm/test/CodeGen/X86/apx/compress-evex.mir index 626904a7a692c1..5a59ab0f8a9d0b 100644 --- a/llvm/test/CodeGen/X86/apx/compress-evex.mir +++ b/llvm/test/CodeGen/X86/apx/compress-evex.mir @@ -108,3 +108,22 @@ body: | $rax = ADC64rr_ND $r16, $rdi, implicit-def dead $eflags, implicit $eflags RET64 $rax ... 
+--- +name: cfcmov_no_convert +body: | + bb.0.entry: + liveins: $eflags, $rax, $rbx + ; CHECK: cfcmovew %bx, %ax, %ax # encoding: [0x62,0xf4,0x7d,0x1c,0x44,0xc3] + ; CHECK: cfcmovsw 24(%rax), %bx, %bx # encoding: [0x62,0xf4,0x65,0x1c,0x48,0x58,0x18] + ; CHECK: cfcmovel %ebx, %eax, %eax # encoding: [0x62,0xf4,0x7c,0x1c,0x44,0xc3] + ; CHECK: cfcmovsl 24(%rax), %ebx, %ebx # encoding: [0x62,0xf4,0x64,0x1c,0x48,0x58,0x18] + ; CHECK: cfcmoveq %rbx, %rax, %rax # encoding: [0x62,0xf4,0xfc,0x1c,0x44,0xc3] + ; CHECK: cfcmovsq 24(%rax), %rbx, %rbx # encoding: [0x62,0xf4,0xe4,0x1c,0x48,0x58,0x18] + $ax = CFCMOV16rr_ND $ax, $bx, 4, implicit $eflags + $bx = CFCMOV16rm_ND $bx, $rax, 1, $noreg, 24, $noreg, 8, implicit $eflags + $eax = CFCMOV32rr_ND $eax, $ebx, 4, implicit $eflags + $ebx = CFCMOV32rm_ND $ebx, $rax, 1, $noreg, 24, $noreg, 8, implicit $eflags + $rax = CFCMOV64rr_ND $rax, $rbx, 4, implicit $eflags + $rbx = CFCMOV64rm_ND $rbx, $rax, 1, $noreg, 24, $noreg, 8, implicit $eflags + RET64 $rax +... diff --git a/llvm/utils/TableGen/X86ManualCompressEVEXTables.def b/llvm/utils/TableGen/X86ManualCompressEVEXTables.def index 665a394f57a6af..cab601bf8131f6 100644 --- a/llvm/utils/TableGen/X86ManualCompressEVEXTables.def +++ b/llvm/utils/TableGen/X86ManualCompressEVEXTables.def @@ -48,6 +48,14 @@ NOCOMP(VPSRAQZ256ri) NOCOMP(VPSRAQZ256rm) NOCOMP(VPSRAQZ256rr) NOCOMP(VSCALEFPSZ256rm) +// When condition evaluates to false, the destination register is zeroed for +// nonNDD CFCMOV but not for NDD CFCMOV. 
+NOCOMP(CFCMOV16rm_ND) +NOCOMP(CFCMOV16rr_ND) +NOCOMP(CFCMOV32rm_ND) +NOCOMP(CFCMOV32rr_ND) +NOCOMP(CFCMOV64rm_ND) +NOCOMP(CFCMOV64rr_ND) #undef NOCOMP #ifndef ENTRY From 35f2caf713489049cc1b31aa3fe0a054968f80e3 Mon Sep 17 00:00:00 2001 From: chuongg3 Date: Wed, 29 May 2024 13:14:03 +0100 Subject: [PATCH 112/230] [AArch64][GlobalISel] Select TBL/TBX Intrinsics (#92914) --- .../GISel/AArch64InstructionSelector.cpp | 45 + llvm/test/CodeGen/AArch64/arm64-tbl.ll | 1373 ++++++++++++----- 2 files changed, 1069 insertions(+), 349 deletions(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 3b3c1fc8b27bf1..4a7c82b393c10e 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -227,6 +227,8 @@ class AArch64InstructionSelector : public InstructionSelector { bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI); bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI); bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI); + void SelectTable(MachineInstr &I, MachineRegisterInfo &MRI, unsigned NumVecs, + unsigned Opc1, unsigned Opc2, bool isExt); bool selectIndexedExtLoad(MachineInstr &I, MachineRegisterInfo &MRI); bool selectIndexedLoad(MachineInstr &I, MachineRegisterInfo &MRI); @@ -6537,6 +6539,25 @@ bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I, I.eraseFromParent(); return true; } + case Intrinsic::aarch64_neon_tbl2: + SelectTable(I, MRI, 2, AArch64::TBLv8i8Two, AArch64::TBLv16i8Two, false); + return true; + case Intrinsic::aarch64_neon_tbl3: + SelectTable(I, MRI, 3, AArch64::TBLv8i8Three, AArch64::TBLv16i8Three, + false); + return true; + case Intrinsic::aarch64_neon_tbl4: + SelectTable(I, MRI, 4, AArch64::TBLv8i8Four, AArch64::TBLv16i8Four, false); + return true; + case Intrinsic::aarch64_neon_tbx2: + SelectTable(I, MRI, 2, AArch64::TBXv8i8Two, 
AArch64::TBXv16i8Two, true); + return true; + case Intrinsic::aarch64_neon_tbx3: + SelectTable(I, MRI, 3, AArch64::TBXv8i8Three, AArch64::TBXv16i8Three, true); + return true; + case Intrinsic::aarch64_neon_tbx4: + SelectTable(I, MRI, 4, AArch64::TBXv8i8Four, AArch64::TBXv16i8Four, true); + return true; case Intrinsic::swift_async_context_addr: auto Sub = MIB.buildInstr(AArch64::SUBXri, {I.getOperand(0).getReg()}, {Register(AArch64::FP)}) @@ -6552,6 +6573,30 @@ bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I, return false; } +void AArch64InstructionSelector::SelectTable(MachineInstr &I, + MachineRegisterInfo &MRI, + unsigned NumVec, unsigned Opc1, + unsigned Opc2, bool isExt) { + Register DstReg = I.getOperand(0).getReg(); + unsigned Opc = MRI.getType(DstReg) == LLT::fixed_vector(8, 8) ? Opc1 : Opc2; + + // Create the REG_SEQUENCE + SmallVector Regs; + for (unsigned i = 0; i < NumVec; i++) + Regs.push_back(I.getOperand(i + 2 + isExt).getReg()); + Register RegSeq = createQTuple(Regs, MIB); + + Register IdxReg = I.getOperand(2 + NumVec + isExt).getReg(); + MachineInstrBuilder Instr; + if (isExt) { + Register Reg = I.getOperand(2).getReg(); + Instr = MIB.buildInstr(Opc, {DstReg}, {Reg, RegSeq, IdxReg}); + } else + Instr = MIB.buildInstr(Opc, {DstReg}, {RegSeq, IdxReg}); + constrainSelectedInstRegOperands(*Instr, TII, TRI, RBI); + I.eraseFromParent(); +} + InstructionSelector::ComplexRendererFns AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const { auto MaybeImmed = getImmedFromMO(Root); diff --git a/llvm/test/CodeGen/AArch64/arm64-tbl.ll b/llvm/test/CodeGen/AArch64/arm64-tbl.ll index 96b2af7274b5bf..44b92e6ccd088f 100644 --- a/llvm/test/CodeGen/AArch64/arm64-tbl.ll +++ b/llvm/test/CodeGen/AArch64/arm64-tbl.ll @@ -1,28 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,CHECK-SD -; RUN: llc < %s 
-mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI - -; CHECK-GI: warning: Instruction selection used fallback path for tbl2_8b -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbl2_16b -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbl3_8b -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbl3_16b -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbl4_8b -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbl4_16b -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_v8i8 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_nonconst_first_mask -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_nonconst_first_mask2 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_nonconst_second_mask -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_nonconst_second_mask2 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_mixed_shuffle -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_mixed_tbl2_mask1 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_mixed_tbl2_mask2 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbx2_8b -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbx2_16b -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbx3_8b -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbx3_16b -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbx4_8b 
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbx4_16b +; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI define <8 x i8> @tbl1_8b(<16 x i8> %A, <8 x i8> %B) nounwind { ; CHECK-LABEL: tbl1_8b: @@ -43,175 +21,378 @@ define <16 x i8> @tbl1_16b(<16 x i8> %A, <16 x i8> %B) nounwind { } define <8 x i8> @tbl2_8b(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) { -; CHECK-LABEL: tbl2_8b: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl.8b v0, { v0, v1 }, v2 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: tbl2_8b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-SD-NEXT: tbl.8b v0, { v0, v1 }, v2 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: tbl2_8b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: tbl.8b v0, { v0, v1 }, v2 +; CHECK-GI-NEXT: ret %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) ret <8 x i8> %tmp3 } define <16 x i8> @tbl2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) { -; CHECK-LABEL: tbl2_16b: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl.16b v0, { v0, v1 }, v2 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: tbl2_16b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-SD-NEXT: tbl.16b v0, { v0, v1 }, v2 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: tbl2_16b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q0 killed 
$q0 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2 +; CHECK-GI-NEXT: ret %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) ret <16 x i8> %tmp3 } define <8 x i8> @tbl3_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) { -; CHECK-LABEL: tbl3_8b: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 -; CHECK-NEXT: tbl.8b v0, { v0, v1, v2 }, v3 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: tbl3_8b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-SD-NEXT: tbl.8b v0, { v0, v1, v2 }, v3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: tbl3_8b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-GI-NEXT: tbl.8b v0, { v0, v1, v2 }, v3 +; CHECK-GI-NEXT: ret %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) ret <8 x i8> %tmp3 } define <16 x i8> @tbl3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) { -; CHECK-LABEL: tbl3_16b: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 -; CHECK-NEXT: tbl.16b v0, { v0, v1, v2 }, v3 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: tbl3_16b: +; CHECK-SD: // 
%bb.0: +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-SD-NEXT: tbl.16b v0, { v0, v1, v2 }, v3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: tbl3_16b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1, v2 }, v3 +; CHECK-GI-NEXT: ret %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) ret <16 x i8> %tmp3 } define <8 x i8> @tbl4_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) { -; CHECK-LABEL: tbl4_8b: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: tbl.8b v0, { v0, v1, v2, v3 }, v4 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: tbl4_8b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: tbl.8b v0, { v0, v1, v2, v3 }, v4 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: tbl4_8b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def 
$q0_q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: tbl.8b v0, { v0, v1, v2, v3 }, v4 +; CHECK-GI-NEXT: ret %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) ret <8 x i8> %tmp3 } define <16 x i8> @tbl4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) { -; CHECK-LABEL: tbl4_16b: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: tbl4_16b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: tbl4_16b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4 +; CHECK-GI-NEXT: ret %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) ret <16 x i8> %tmp3 } -; 
CHECK-LABEL: .LCPI8_0: -; CHECK-NEXT: .byte 0 // 0x0 -; CHECK-NEXT: .byte 4 // 0x4 -; CHECK-NEXT: .byte 8 // 0x8 -; CHECK-NEXT: .byte 12 // 0xc -; CHECK-NEXT: .byte 255 // 0xff -; CHECK-NEXT: .byte 255 // 0xff -; CHECK-NEXT: .byte 255 // 0xff -; CHECK-NEXT: .byte 255 // 0xff +; CHECK-SD-LABEL: .LCPI8_0: +; CHECK-SD: .byte 0 // 0x0 +; CHECK-SD-NEXT: .byte 4 // 0x4 +; CHECK-SD-NEXT: .byte 8 // 0x8 +; CHECK-SD-NEXT: .byte 12 // 0xc +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff + +; CHECK-GI-LABEL: .LCPI8_0: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 1 // 0x1 +; CHECK-GI-NEXT: .byte 2 // 0x2 +; CHECK-GI-NEXT: .byte 3 // 0x3 +; CHECK-GI-NEXT: .byte 12 // 0xc +; CHECK-GI-NEXT: .byte 13 // 0xd +; CHECK-GI-NEXT: .byte 14 // 0xe +; CHECK-GI-NEXT: .byte 15 // 0xf +; CHECK-GI-LABEL: .LCPI8_1: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 8 // 0x8 +; CHECK-GI-NEXT: .byte 12 // 0xc +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff define <8 x i8> @shuffled_tbl2_to_tbl4_v8i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { -; CHECK-LABEL: shuffled_tbl2_to_tbl4_v8i8: -; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI8_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 -; CHECK-NEXT: ldr d4, [x8, :lo12:.LCPI8_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 -; CHECK-NEXT: tbl.8b v0, { v0, v1 }, v4 -; CHECK-NEXT: tbl.8b v1, { v2, v3 }, v4 -; CHECK-NEXT: mov.s v0[1], v1[1] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_v8i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI8_0 +; CHECK-SD-NEXT: 
// kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 +; CHECK-SD-NEXT: ldr d4, [x8, :lo12:.LCPI8_0] +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 +; CHECK-SD-NEXT: tbl.8b v0, { v0, v1 }, v4 +; CHECK-SD-NEXT: tbl.8b v1, { v2, v3 }, v4 +; CHECK-SD-NEXT: mov.s v0[1], v1[1] +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_v8i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI8_1 +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: ldr d4, [x8, :lo12:.LCPI8_1] +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: adrp x8, .LCPI8_0 +; CHECK-GI-NEXT: tbl.8b v0, { v0, v1 }, v4 +; CHECK-GI-NEXT: tbl.8b v1, { v2, v3 }, v4 +; CHECK-GI-NEXT: mov.d v0[1], v1[0] +; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI8_0] +; CHECK-GI-NEXT: tbl.16b v0, { v0 }, v1 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret %t1 = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %a, <16 x i8> %b, <8 x i8> ) %t2 = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %c, <16 x i8> %d, <8 x i8> ) %s = shufflevector <8 x i8> %t1, <8 x i8> %t2, <8 x i32> ret <8 x i8> %s } -; CHECK-LABEL: .LCPI9_0: -; CHECK-NEXT: .byte 0 // 0x0 -; CHECK-NEXT: .byte 4 // 0x4 -; CHECK-NEXT: .byte 8 // 0x8 -; CHECK-NEXT: .byte 12 // 0xc -; CHECK-NEXT: .byte 16 // 0x10 -; CHECK-NEXT: .byte 20 // 0x14 -; CHECK-NEXT: .byte 24 // 0x18 -; CHECK-NEXT: .byte 28 // 0x1c -; CHECK-NEXT: .byte 32 // 0x20 -; CHECK-NEXT: .byte 36 // 0x24 -; CHECK-NEXT: .byte 40 // 0x28 -; CHECK-NEXT: .byte 44 // 0x2c -; CHECK-NEXT: .byte 48 // 0x30 -; CHECK-NEXT: .byte 52 // 
0x34 -; CHECK-NEXT: .byte 56 // 0x38 -; CHECK-NEXT: .byte 60 // 0x3c +; CHECK-SD-LABEL: .LCPI9_0: +; CHECK-SD-NEXT: .byte 0 // 0x0 +; CHECK-SD-NEXT: .byte 4 // 0x4 +; CHECK-SD-NEXT: .byte 8 // 0x8 +; CHECK-SD-NEXT: .byte 12 // 0xc +; CHECK-SD-NEXT: .byte 16 // 0x10 +; CHECK-SD-NEXT: .byte 20 // 0x14 +; CHECK-SD-NEXT: .byte 24 // 0x18 +; CHECK-SD-NEXT: .byte 28 // 0x1c +; CHECK-SD-NEXT: .byte 32 // 0x20 +; CHECK-SD-NEXT: .byte 36 // 0x24 +; CHECK-SD-NEXT: .byte 40 // 0x28 +; CHECK-SD-NEXT: .byte 44 // 0x2c +; CHECK-SD-NEXT: .byte 48 // 0x30 +; CHECK-SD-NEXT: .byte 52 // 0x34 +; CHECK-SD-NEXT: .byte 56 // 0x38 +; CHECK-SD-NEXT: .byte 60 // 0x3c + +;CHECK-GI-LABEL: .LCPI9_0: +;CHECK-GI: .byte 0 // 0x0 +;CHECK-GI-NEXT: .byte 1 // 0x1 +;CHECK-GI-NEXT: .byte 2 // 0x2 +;CHECK-GI-NEXT: .byte 3 // 0x3 +;CHECK-GI-NEXT: .byte 4 // 0x4 +;CHECK-GI-NEXT: .byte 5 // 0x5 +;CHECK-GI-NEXT: .byte 6 // 0x6 +;CHECK-GI-NEXT: .byte 7 // 0x7 +;CHECK-GI-NEXT: .byte 16 // 0x10 +;CHECK-GI-NEXT: .byte 17 // 0x11 +;CHECK-GI-NEXT: .byte 18 // 0x12 +;CHECK-GI-NEXT: .byte 19 // 0x13 +;CHECK-GI-NEXT: .byte 20 // 0x14 +;CHECK-GI-NEXT: .byte 21 // 0x15 +;CHECK-GI-NEXT: .byte 22 // 0x16 +;CHECK-GI-NEXT: .byte 23 // 0x17 +;CHECK-GI-LABEL: .LCPI9_1: +;CHECK-GI: .byte 0 // 0x0 +;CHECK-GI-NEXT: .byte 4 // 0x4 +;CHECK-GI-NEXT: .byte 8 // 0x8 +;CHECK-GI-NEXT: .byte 12 // 0xc +;CHECK-GI-NEXT: .byte 16 // 0x10 +;CHECK-GI-NEXT: .byte 20 // 0x14 +;CHECK-GI-NEXT: .byte 24 // 0x18 +;CHECK-GI-NEXT: .byte 28 // 0x1c +;CHECK-GI-NEXT: .byte 255 // 0xff +;CHECK-GI-NEXT: .byte 255 // 0xff +;CHECK-GI-NEXT: .byte 255 // 0xff +;CHECK-GI-NEXT: .byte 255 // 0xff +;CHECK-GI-NEXT: .byte 255 // 0xff +;CHECK-GI-NEXT: .byte 255 // 0xff +;CHECK-GI-NEXT: .byte 255 // 0xff +;CHECK-GI-NEXT: .byte 255 // 0xff define <16 x i8> @shuffled_tbl2_to_tbl4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { -; CHECK-LABEL: shuffled_tbl2_to_tbl4: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def 
$q0_q1_q2_q3 -; CHECK-NEXT: adrp x8, .LCPI9_0 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI9_0] -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: adrp x8, .LCPI9_0 +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI9_0] +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI9_1 +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI9_1] +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: adrp x8, .LCPI9_0 +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v4 +; CHECK-GI-NEXT: tbl.16b v1, { v2, v3 }, v4 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI9_0] +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2 +; CHECK-GI-NEXT: ret %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> ) %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> ret <16 x i8> %s } +; CHECK-GI-LABEL: .LCPI10_0: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 1 // 0x1 +; CHECK-GI-NEXT: 
.byte 2 // 0x2 +; CHECK-GI-NEXT: .byte 3 // 0x3 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 5 // 0x5 +; CHECK-GI-NEXT: .byte 6 // 0x6 +; CHECK-GI-NEXT: .byte 7 // 0x7 +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 17 // 0x11 +; CHECK-GI-NEXT: .byte 18 // 0x12 +; CHECK-GI-NEXT: .byte 19 // 0x13 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 21 // 0x15 +; CHECK-GI-NEXT: .byte 22 // 0x16 +; CHECK-GI-NEXT: .byte 23 // 0x17 +; CHECK-GI-LABEL: .LCPI10_1: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 8 // 0x8 +; CHECK-GI-NEXT: .byte 12 // 0xc +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 24 // 0x18 +; CHECK-GI-NEXT: .byte 28 // 0x1c +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff + define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) { -; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask: -; CHECK: // %bb.0: -; CHECK-NEXT: fmov s4, w0 -; CHECK-NEXT: mov w8, #32 // =0x20 -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: mov.b v4[1], w0 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: mov.b v4[2], w0 -; CHECK-NEXT: mov.b v4[3], w0 -; CHECK-NEXT: mov.b v4[4], w0 -; CHECK-NEXT: mov.b v4[5], w0 -; CHECK-NEXT: mov.b v4[6], w0 -; CHECK-NEXT: mov.b v4[7], w0 -; CHECK-NEXT: mov.b v4[8], w8 -; CHECK-NEXT: mov w8, #36 // =0x24 -; CHECK-NEXT: mov.b v4[9], w8 -; CHECK-NEXT: mov w8, #40 // =0x28 -; 
CHECK-NEXT: mov.b v4[10], w8 -; CHECK-NEXT: mov w8, #44 // =0x2c -; CHECK-NEXT: mov.b v4[11], w8 -; CHECK-NEXT: mov w8, #48 // =0x30 -; CHECK-NEXT: mov.b v4[12], w8 -; CHECK-NEXT: mov w8, #52 // =0x34 -; CHECK-NEXT: mov.b v4[13], w8 -; CHECK-NEXT: mov w8, #56 // =0x38 -; CHECK-NEXT: mov.b v4[14], w8 -; CHECK-NEXT: mov w8, #60 // =0x3c -; CHECK-NEXT: mov.b v4[15], w8 -; CHECK-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmov s4, w0 +; CHECK-SD-NEXT: mov w8, #32 // =0x20 +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: mov.b v4[1], w0 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: mov.b v4[2], w0 +; CHECK-SD-NEXT: mov.b v4[3], w0 +; CHECK-SD-NEXT: mov.b v4[4], w0 +; CHECK-SD-NEXT: mov.b v4[5], w0 +; CHECK-SD-NEXT: mov.b v4[6], w0 +; CHECK-SD-NEXT: mov.b v4[7], w0 +; CHECK-SD-NEXT: mov.b v4[8], w8 +; CHECK-SD-NEXT: mov w8, #36 // =0x24 +; CHECK-SD-NEXT: mov.b v4[9], w8 +; CHECK-SD-NEXT: mov w8, #40 // =0x28 +; CHECK-SD-NEXT: mov.b v4[10], w8 +; CHECK-SD-NEXT: mov w8, #44 // =0x2c +; CHECK-SD-NEXT: mov.b v4[11], w8 +; CHECK-SD-NEXT: mov w8, #48 // =0x30 +; CHECK-SD-NEXT: mov.b v4[12], w8 +; CHECK-SD-NEXT: mov w8, #52 // =0x34 +; CHECK-SD-NEXT: mov.b v4[13], w8 +; CHECK-SD-NEXT: mov w8, #56 // =0x38 +; CHECK-SD-NEXT: mov.b v4[14], w8 +; CHECK-SD-NEXT: mov w8, #60 // =0x3c +; CHECK-SD-NEXT: mov.b v4[15], w8 +; CHECK-SD-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov s4, w0 +; CHECK-GI-NEXT: mov w8, #255 // =0xff +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed 
$q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: mov.16b v5, v4 +; CHECK-GI-NEXT: mov.b v5[1], v4[0] +; CHECK-GI-NEXT: mov.b v5[2], v4[0] +; CHECK-GI-NEXT: mov.b v5[3], v4[0] +; CHECK-GI-NEXT: mov.b v5[4], v4[0] +; CHECK-GI-NEXT: mov.b v5[5], v4[0] +; CHECK-GI-NEXT: mov.b v5[6], v4[0] +; CHECK-GI-NEXT: mov.b v5[7], v4[0] +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: adrp x8, .LCPI10_1 +; CHECK-GI-NEXT: mov.b v5[8], v4[0] +; CHECK-GI-NEXT: mov.b v5[9], v4[0] +; CHECK-GI-NEXT: mov.b v5[10], v4[0] +; CHECK-GI-NEXT: mov.b v5[11], v4[0] +; CHECK-GI-NEXT: mov.b v5[12], v4[0] +; CHECK-GI-NEXT: mov.b v5[13], v4[0] +; CHECK-GI-NEXT: mov.b v5[14], v4[0] +; CHECK-GI-NEXT: mov.b v5[15], v4[0] +; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI10_1] +; CHECK-GI-NEXT: adrp x8, .LCPI10_0 +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v5 +; CHECK-GI-NEXT: tbl.16b v1, { v2, v3 }, v4 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI10_0] +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2 +; CHECK-GI-NEXT: ret %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0 %ins.1 = insertelement <16 x i8> %ins.0, i8 %v, i32 1 %ins.2 = insertelement <16 x i8> %ins.1, i8 %v, i32 2 @@ -234,40 +415,111 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x ret <16 x i8> %s } +; CHECK-GI-LABEL: .LCPI11_0: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 1 // 0x1 +; CHECK-GI-NEXT: .byte 2 // 0x2 +; CHECK-GI-NEXT: .byte 3 // 0x3 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 5 // 0x5 +; CHECK-GI-NEXT: .byte 6 // 0x6 +; CHECK-GI-NEXT: .byte 15 // 0xf +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 17 // 0x11 +; CHECK-GI-NEXT: .byte 18 // 0x12 +; CHECK-GI-NEXT: .byte 19 // 0x13 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 21 // 0x15 +; CHECK-GI-NEXT: .byte 22 // 
0x16 +; CHECK-GI-NEXT: .byte 31 // 0x1f +; CHECK-GI-LABEL: .LCPI11_1: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 8 // 0x8 +; CHECK-GI-NEXT: .byte 12 // 0xc +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 24 // 0x18 +; CHECK-GI-NEXT: .byte 28 // 0x1c +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff + define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) { -; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask2: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #1 // =0x1 -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: mov.b v4[1], w8 -; CHECK-NEXT: mov.b v4[2], w8 -; CHECK-NEXT: mov.b v4[3], w8 -; CHECK-NEXT: mov.b v4[4], w8 -; CHECK-NEXT: mov.b v4[5], w8 -; CHECK-NEXT: mov.b v4[6], w8 -; CHECK-NEXT: mov w8, #32 // =0x20 -; CHECK-NEXT: mov.b v4[7], w0 -; CHECK-NEXT: mov.b v4[8], w8 -; CHECK-NEXT: mov w8, #36 // =0x24 -; CHECK-NEXT: mov.b v4[9], w8 -; CHECK-NEXT: mov w8, #40 // =0x28 -; CHECK-NEXT: mov.b v4[10], w8 -; CHECK-NEXT: mov w8, #44 // =0x2c -; CHECK-NEXT: mov.b v4[11], w8 -; CHECK-NEXT: mov w8, #48 // =0x30 -; CHECK-NEXT: mov.b v4[12], w8 -; CHECK-NEXT: mov w8, #52 // =0x34 -; CHECK-NEXT: mov.b v4[13], w8 -; CHECK-NEXT: mov w8, #56 // =0x38 -; CHECK-NEXT: mov.b v4[14], w8 -; CHECK-NEXT: mov w8, #31 // =0x1f -; CHECK-NEXT: mov.b v4[15], w8 -; CHECK-NEXT: tbl.16b 
v0, { v0, v1, v2, v3 }, v4 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mov w8, #1 // =0x1 +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: fmov s4, w8 +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: mov.b v4[1], w8 +; CHECK-SD-NEXT: mov.b v4[2], w8 +; CHECK-SD-NEXT: mov.b v4[3], w8 +; CHECK-SD-NEXT: mov.b v4[4], w8 +; CHECK-SD-NEXT: mov.b v4[5], w8 +; CHECK-SD-NEXT: mov.b v4[6], w8 +; CHECK-SD-NEXT: mov w8, #32 // =0x20 +; CHECK-SD-NEXT: mov.b v4[7], w0 +; CHECK-SD-NEXT: mov.b v4[8], w8 +; CHECK-SD-NEXT: mov w8, #36 // =0x24 +; CHECK-SD-NEXT: mov.b v4[9], w8 +; CHECK-SD-NEXT: mov w8, #40 // =0x28 +; CHECK-SD-NEXT: mov.b v4[10], w8 +; CHECK-SD-NEXT: mov w8, #44 // =0x2c +; CHECK-SD-NEXT: mov.b v4[11], w8 +; CHECK-SD-NEXT: mov w8, #48 // =0x30 +; CHECK-SD-NEXT: mov.b v4[12], w8 +; CHECK-SD-NEXT: mov w8, #52 // =0x34 +; CHECK-SD-NEXT: mov.b v4[13], w8 +; CHECK-SD-NEXT: mov w8, #56 // =0x38 +; CHECK-SD-NEXT: mov.b v4[14], w8 +; CHECK-SD-NEXT: mov w8, #31 // =0x1f +; CHECK-SD-NEXT: mov.b v4[15], w8 +; CHECK-SD-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: fmov s6, w0 +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: mov w8, #255 // =0xff +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: mov.16b v5, v4 +; CHECK-GI-NEXT: 
mov.b v5[1], v4[0] +; CHECK-GI-NEXT: mov.b v5[2], v4[0] +; CHECK-GI-NEXT: mov.b v5[3], v4[0] +; CHECK-GI-NEXT: mov.b v5[4], v4[0] +; CHECK-GI-NEXT: mov.b v5[5], v4[0] +; CHECK-GI-NEXT: mov.b v5[6], v4[0] +; CHECK-GI-NEXT: mov.b v5[7], v4[0] +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: adrp x8, .LCPI11_1 +; CHECK-GI-NEXT: mov.b v5[8], v4[0] +; CHECK-GI-NEXT: mov.b v5[9], v4[0] +; CHECK-GI-NEXT: mov.b v5[10], v4[0] +; CHECK-GI-NEXT: mov.b v5[11], v4[0] +; CHECK-GI-NEXT: mov.b v5[12], v6[0] +; CHECK-GI-NEXT: mov.b v5[13], v6[0] +; CHECK-GI-NEXT: mov.b v5[14], v4[0] +; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI11_1] +; CHECK-GI-NEXT: adrp x8, .LCPI11_0 +; CHECK-GI-NEXT: mov.b v5[15], v6[0] +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v5 +; CHECK-GI-NEXT: tbl.16b v1, { v2, v3 }, v4 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI11_0] +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2 +; CHECK-GI-NEXT: ret %ins.0 = insertelement <16 x i8> poison, i8 1, i32 0 %ins.1 = insertelement <16 x i8> %ins.0, i8 1, i32 1 %ins.2 = insertelement <16 x i8> %ins.1, i8 1, i32 2 @@ -290,29 +542,116 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x ret <16 x i8> %s } +; CHECK-SD-LABEL: .LCPI12_0: +; CHECK-SD: .byte 0 // 0x0 +; CHECK-SD-NEXT: .byte 4 // 0x4 +; CHECK-SD-NEXT: .byte 8 // 0x8 +; CHECK-SD-NEXT: .byte 12 // 0xc +; CHECK-SD-NEXT: .byte 16 // 0x10 +; CHECK-SD-NEXT: .byte 20 // 0x14 +; CHECK-SD-NEXT: .byte 24 // 0x18 +; CHECK-SD-NEXT: .byte 28 // 0x1c +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff + +; CHECK-GI-LABEL: .LCPI12_0: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 1 // 0x1 +; CHECK-GI-NEXT: .byte 2 // 0x2 +; CHECK-GI-NEXT: .byte 3 // 0x3 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 5 // 0x5 +; 
CHECK-GI-NEXT: .byte 6 // 0x6 +; CHECK-GI-NEXT: .byte 7 // 0x7 +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 17 // 0x11 +; CHECK-GI-NEXT: .byte 18 // 0x12 +; CHECK-GI-NEXT: .byte 19 // 0x13 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 21 // 0x15 +; CHECK-GI-NEXT: .byte 22 // 0x16 +; CHECK-GI-NEXT: .byte 23 // 0x17 +; CHECK-GI-LABEL: .LCPI12_1: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 8 // 0x8 +; CHECK-GI-NEXT: .byte 12 // 0xc +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 24 // 0x18 +; CHECK-GI-NEXT: .byte 28 // 0x1c +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff + define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) { -; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask: -; CHECK: // %bb.0: -; CHECK-NEXT: movi.2d v4, #0xffffffffffffffff -; CHECK-NEXT: adrp x8, .LCPI12_0 -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI12_0] -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl.16b v2, { v2, v3 }, v5 -; CHECK-NEXT: mov.b v4[0], w0 -; CHECK-NEXT: mov.b v4[1], w0 -; CHECK-NEXT: mov.b v4[2], w0 -; CHECK-NEXT: mov.b v4[3], w0 -; CHECK-NEXT: mov.b v4[4], w0 -; CHECK-NEXT: mov.b v4[5], w0 -; CHECK-NEXT: mov.b v4[6], w0 -; CHECK-NEXT: mov.b v4[7], w0 -; CHECK-NEXT: tbl.16b v0, { v0, v1 }, v4 -; CHECK-NEXT: mov.d v2[1], v0[0] -; CHECK-NEXT: mov.16b v0, v2 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: 
shuffled_tbl2_to_tbl4_nonconst_second_mask: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi.2d v4, #0xffffffffffffffff +; CHECK-SD-NEXT: adrp x8, .LCPI12_0 +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-SD-NEXT: ldr q5, [x8, :lo12:.LCPI12_0] +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-SD-NEXT: tbl.16b v2, { v2, v3 }, v5 +; CHECK-SD-NEXT: mov.b v4[0], w0 +; CHECK-SD-NEXT: mov.b v4[1], w0 +; CHECK-SD-NEXT: mov.b v4[2], w0 +; CHECK-SD-NEXT: mov.b v4[3], w0 +; CHECK-SD-NEXT: mov.b v4[4], w0 +; CHECK-SD-NEXT: mov.b v4[5], w0 +; CHECK-SD-NEXT: mov.b v4[6], w0 +; CHECK-SD-NEXT: mov.b v4[7], w0 +; CHECK-SD-NEXT: tbl.16b v0, { v0, v1 }, v4 +; CHECK-SD-NEXT: mov.d v2[1], v0[0] +; CHECK-SD-NEXT: mov.16b v0, v2 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov s4, w0 +; CHECK-GI-NEXT: mov w8, #255 // =0xff +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: mov.16b v5, v4 +; CHECK-GI-NEXT: mov.b v5[1], v4[0] +; CHECK-GI-NEXT: mov.b v5[2], v4[0] +; CHECK-GI-NEXT: mov.b v5[3], v4[0] +; CHECK-GI-NEXT: mov.b v5[4], v4[0] +; CHECK-GI-NEXT: mov.b v5[5], v4[0] +; CHECK-GI-NEXT: mov.b v5[6], v4[0] +; CHECK-GI-NEXT: mov.b v5[7], v4[0] +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: adrp x8, .LCPI12_1 +; CHECK-GI-NEXT: mov.b v5[8], v4[0] +; CHECK-GI-NEXT: mov.b v5[9], v4[0] +; CHECK-GI-NEXT: mov.b v5[10], v4[0] +; CHECK-GI-NEXT: mov.b v5[11], v4[0] +; CHECK-GI-NEXT: mov.b v5[12], v4[0] +; CHECK-GI-NEXT: mov.b v5[13], v4[0] +; CHECK-GI-NEXT: mov.b v5[14], 
v4[0] +; CHECK-GI-NEXT: mov.b v5[15], v4[0] +; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI12_1] +; CHECK-GI-NEXT: adrp x8, .LCPI12_0 +; CHECK-GI-NEXT: tbl.16b v2, { v2, v3 }, v4 +; CHECK-GI-NEXT: tbl.16b v3, { v0, v1 }, v5 +; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI12_0] +; CHECK-GI-NEXT: tbl.16b v0, { v2, v3 }, v0 +; CHECK-GI-NEXT: ret %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0 %ins.1 = insertelement <16 x i8> %ins.0, i8 %v, i32 1 %ins.2 = insertelement <16 x i8> %ins.1, i8 %v, i32 2 @@ -335,29 +674,133 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x i8> %a, <16 x ret <16 x i8> %s } +; CHECK-SD-LABEL: .LCPI13_0: +; CHECK-SD: .byte 0 // 0x0 +; CHECK-SD-NEXT: .byte 4 // 0x4 +; CHECK-SD-NEXT: .byte 8 // 0x8 +; CHECK-SD-NEXT: .byte 12 // 0xc +; CHECK-SD-NEXT: .byte 16 // 0x10 +; CHECK-SD-NEXT: .byte 20 // 0x14 +; CHECK-SD-NEXT: .byte 24 // 0x18 +; CHECK-SD-NEXT: .byte 28 // 0x1c +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-LABEL: .LCPI13_1: +; CHECK-SD: .byte 0 // 0x0 +; CHECK-SD-NEXT: .byte 1 // 0x1 +; CHECK-SD-NEXT: .byte 2 // 0x2 +; CHECK-SD-NEXT: .byte 3 // 0x3 +; CHECK-SD-NEXT: .byte 4 // 0x4 +; CHECK-SD-NEXT: .byte 5 // 0x5 +; CHECK-SD-NEXT: .byte 6 // 0x6 +; CHECK-SD-NEXT: .byte 7 // 0x7 +; CHECK-SD-NEXT: .byte 16 // 0x10 +; CHECK-SD-NEXT: .byte 17 // 0x11 +; CHECK-SD-NEXT: .byte 18 // 0x12 +; CHECK-SD-NEXT: .byte 19 // 0x13 +; CHECK-SD-NEXT: .byte 20 // 0x14 +; CHECK-SD-NEXT: .byte 21 // 0x15 +; CHECK-SD-NEXT: .byte 30 // 0x1e +; CHECK-SD-NEXT: .byte 31 // 0x1f + +; CHECK-GI-LABEL: .LCPI13_0: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 1 // 0x1 +; CHECK-GI-NEXT: .byte 2 // 0x2 +; CHECK-GI-NEXT: .byte 3 // 0x3 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 5 // 0x5 
+; CHECK-GI-NEXT: .byte 6 // 0x6 +; CHECK-GI-NEXT: .byte 7 // 0x7 +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 17 // 0x11 +; CHECK-GI-NEXT: .byte 18 // 0x12 +; CHECK-GI-NEXT: .byte 19 // 0x13 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 21 // 0x15 +; CHECK-GI-NEXT: .byte 30 // 0x1e +; CHECK-GI-NEXT: .byte 31 // 0x1f +; CHECK-GI-LABEL: .LCPI13_1: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 8 // 0x8 +; CHECK-GI-NEXT: .byte 12 // 0xc +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 24 // 0x18 +; CHECK-GI-NEXT: .byte 28 // 0x1c +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff + define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) { -; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask2: -; CHECK: // %bb.0: -; CHECK-NEXT: dup.16b v4, w0 -; CHECK-NEXT: mov w8, #255 // =0xff -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: mov.b v4[8], w8 -; CHECK-NEXT: mov.b v4[9], w8 -; CHECK-NEXT: mov.b v4[10], w8 -; CHECK-NEXT: mov.b v4[11], w8 -; CHECK-NEXT: mov.b v4[12], w8 -; CHECK-NEXT: mov.b v4[13], w8 -; CHECK-NEXT: adrp x8, .LCPI13_0 -; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI13_0] -; CHECK-NEXT: adrp x8, .LCPI13_1 -; CHECK-NEXT: tbl.16b v2, { v2, v3 }, v5 -; CHECK-NEXT: tbl.16b v3, { v0, v1 }, v4 -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI13_1] -; CHECK-NEXT: tbl.16b v0, { v2, v3 }, v0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: 
shuffled_tbl2_to_tbl4_nonconst_second_mask2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: dup.16b v4, w0 +; CHECK-SD-NEXT: mov w8, #255 // =0xff +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-SD-NEXT: mov.b v4[8], w8 +; CHECK-SD-NEXT: mov.b v4[9], w8 +; CHECK-SD-NEXT: mov.b v4[10], w8 +; CHECK-SD-NEXT: mov.b v4[11], w8 +; CHECK-SD-NEXT: mov.b v4[12], w8 +; CHECK-SD-NEXT: mov.b v4[13], w8 +; CHECK-SD-NEXT: adrp x8, .LCPI13_0 +; CHECK-SD-NEXT: ldr q5, [x8, :lo12:.LCPI13_0] +; CHECK-SD-NEXT: adrp x8, .LCPI13_1 +; CHECK-SD-NEXT: tbl.16b v2, { v2, v3 }, v5 +; CHECK-SD-NEXT: tbl.16b v3, { v0, v1 }, v4 +; CHECK-SD-NEXT: ldr q0, [x8, :lo12:.LCPI13_1] +; CHECK-SD-NEXT: tbl.16b v0, { v2, v3 }, v0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov s4, w0 +; CHECK-GI-NEXT: mov w8, #255 // =0xff +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: fmov s6, w8 +; CHECK-GI-NEXT: adrp x8, .LCPI13_1 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: mov.16b v5, v4 +; CHECK-GI-NEXT: mov.b v5[1], v4[0] +; CHECK-GI-NEXT: mov.b v5[2], v4[0] +; CHECK-GI-NEXT: mov.b v5[3], v4[0] +; CHECK-GI-NEXT: mov.b v5[4], v4[0] +; CHECK-GI-NEXT: mov.b v5[5], v4[0] +; CHECK-GI-NEXT: mov.b v5[6], v4[0] +; CHECK-GI-NEXT: mov.b v5[7], v4[0] +; CHECK-GI-NEXT: mov.b v5[8], v6[0] +; CHECK-GI-NEXT: mov.b v5[9], v6[0] +; CHECK-GI-NEXT: mov.b v5[10], v6[0] +; CHECK-GI-NEXT: mov.b v5[11], v6[0] +; CHECK-GI-NEXT: mov.b v5[12], v6[0] +; CHECK-GI-NEXT: mov.b v5[13], v6[0] +; 
CHECK-GI-NEXT: mov.b v5[14], v4[0] +; CHECK-GI-NEXT: mov.b v5[15], v4[0] +; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI13_1] +; CHECK-GI-NEXT: adrp x8, .LCPI13_0 +; CHECK-GI-NEXT: tbl.16b v2, { v2, v3 }, v4 +; CHECK-GI-NEXT: tbl.16b v3, { v0, v1 }, v5 +; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI13_0] +; CHECK-GI-NEXT: tbl.16b v0, { v2, v3 }, v0 +; CHECK-GI-NEXT: ret %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0 %ins.1 = insertelement <16 x i8> %ins.0, i8 %v, i32 1 %ins.2 = insertelement <16 x i8> %ins.1, i8 %v, i32 2 @@ -380,106 +823,293 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2(<16 x i8> %a, <16 ret <16 x i8> %s } +; CHECK-SD-LABEL: .LCPI14_0: +; CHECK-SD: .byte 0 // 0x0 +; CHECK-SD-NEXT: .byte 4 // 0x4 +; CHECK-SD-NEXT: .byte 52 // 0x34 +; CHECK-SD-NEXT: .byte 12 // 0xc +; CHECK-SD-NEXT: .byte 16 // 0x10 +; CHECK-SD-NEXT: .byte 20 // 0x14 +; CHECK-SD-NEXT: .byte 24 // 0x18 +; CHECK-SD-NEXT: .byte 28 // 0x1c +; CHECK-SD-NEXT: .byte 32 // 0x20 +; CHECK-SD-NEXT: .byte 36 // 0x24 +; CHECK-SD-NEXT: .byte 40 // 0x28 +; CHECK-SD-NEXT: .byte 44 // 0x2c +; CHECK-SD-NEXT: .byte 48 // 0x30 +; CHECK-SD-NEXT: .byte 52 // 0x34 +; CHECK-SD-NEXT: .byte 56 // 0x38 +; CHECK-SD-NEXT: .byte 60 // 0x3c -; CHECK-LABEL: .LCPI14_0: -; CHECK-NEXT: .byte 0 // 0x0 -; CHECK-NEXT: .byte 4 // 0x4 -; CHECK-NEXT: .byte 52 // 0x34 -; CHECK-NEXT: .byte 12 // 0xc -; CHECK-NEXT: .byte 16 // 0x10 -; CHECK-NEXT: .byte 20 // 0x14 -; CHECK-NEXT: .byte 24 // 0x18 -; CHECK-NEXT: .byte 28 // 0x1c -; CHECK-NEXT: .byte 32 // 0x20 -; CHECK-NEXT: .byte 36 // 0x24 -; CHECK-NEXT: .byte 40 // 0x28 -; CHECK-NEXT: .byte 44 // 0x2c -; CHECK-NEXT: .byte 48 // 0x30 -; CHECK-NEXT: .byte 52 // 0x34 -; CHECK-NEXT: .byte 56 // 0x38 -; CHECK-NEXT: .byte 60 // 0x3c +; CHECK-GI-LABEL: .LCPI14_0: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 1 // 0x1 +; CHECK-GI-NEXT: .byte 21 // 0x15 +; CHECK-GI-NEXT: .byte 3 // 0x3 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 5 // 0x5 +; 
CHECK-GI-NEXT: .byte 6 // 0x6 +; CHECK-GI-NEXT: .byte 7 // 0x7 +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 17 // 0x11 +; CHECK-GI-NEXT: .byte 18 // 0x12 +; CHECK-GI-NEXT: .byte 19 // 0x13 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 21 // 0x15 +; CHECK-GI-NEXT: .byte 22 // 0x16 +; CHECK-GI-NEXT: .byte 23 // 0x17 +; CHECK-GI-LABEL: .LCPI14_1: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 8 // 0x8 +; CHECK-GI-NEXT: .byte 12 // 0xc +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 24 // 0x18 +; CHECK-GI-NEXT: .byte 28 // 0x1c +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_shuffle(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { -; CHECK-LABEL: shuffled_tbl2_to_tbl4_mixed_shuffle: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: adrp x8, .LCPI14_0 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI14_0] -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_mixed_shuffle: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: adrp x8, .LCPI14_0 +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI14_0] +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; 
CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_mixed_shuffle: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI14_1 +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI14_1] +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: adrp x8, .LCPI14_0 +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v4 +; CHECK-GI-NEXT: tbl.16b v1, { v2, v3 }, v4 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI14_0] +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2 +; CHECK-GI-NEXT: ret %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> ) %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> ret <16 x i8> %s } -; CHECK-LABEL: .LCPI15_0: -; CHECK-NEXT: .byte 0 // 0x0 -; CHECK-NEXT: .byte 4 // 0x4 -; CHECK-NEXT: .byte 52 // 0x34 -; CHECK-NEXT: .byte 12 // 0xc -; CHECK-NEXT: .byte 16 // 0x10 -; CHECK-NEXT: .byte 20 // 0x14 -; CHECK-NEXT: .byte 24 // 0x18 -; CHECK-NEXT: .byte 28 // 0x1c -; CHECK-NEXT: .byte 32 // 0x20 -; CHECK-NEXT: .byte 36 // 0x24 -; CHECK-NEXT: .byte 40 // 0x28 -; CHECK-NEXT: .byte 44 // 0x2c -; CHECK-NEXT: .byte 48 // 0x30 -; CHECK-NEXT: .byte 52 // 0x34 -; CHECK-NEXT: .byte 56 // 0x38 -; CHECK-NEXT: .byte 60 // 0x3c +; CHECK-SD-LABEL: .LCPI15_0: +; CHECK-SD: .byte 0 // 0x0 +; CHECK-SD-NEXT: .byte 4 // 0x4 +; CHECK-SD-NEXT: .byte 52 // 0x34 +; CHECK-SD-NEXT: .byte 12 // 0xc +; CHECK-SD-NEXT: .byte 16 // 0x10 +; CHECK-SD-NEXT: .byte 20 // 0x14 +; CHECK-SD-NEXT: .byte 24 // 0x18 +; CHECK-SD-NEXT: .byte 28 // 0x1c +; CHECK-SD-NEXT: .byte 32 // 0x20 +; CHECK-SD-NEXT: 
.byte 36 // 0x24 +; CHECK-SD-NEXT: .byte 40 // 0x28 +; CHECK-SD-NEXT: .byte 44 // 0x2c +; CHECK-SD-NEXT: .byte 48 // 0x30 +; CHECK-SD-NEXT: .byte 52 // 0x34 +; CHECK-SD-NEXT: .byte 56 // 0x38 +; CHECK-SD-NEXT: .byte 60 // 0x3c + +; CHECK-GI-LABEL: .LCPI15_0: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 1 // 0x1 +; CHECK-GI-NEXT: .byte 21 // 0x15 +; CHECK-GI-NEXT: .byte 3 // 0x3 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 5 // 0x5 +; CHECK-GI-NEXT: .byte 6 // 0x6 +; CHECK-GI-NEXT: .byte 7 // 0x7 +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 17 // 0x11 +; CHECK-GI-NEXT: .byte 18 // 0x12 +; CHECK-GI-NEXT: .byte 19 // 0x13 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 21 // 0x15 +; CHECK-GI-NEXT: .byte 22 // 0x16 +; CHECK-GI-NEXT: .byte 23 // 0x17 +; CHECK-GI-LABEL: .LCPI15_1: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 8 // 0x8 +; CHECK-GI-NEXT: .byte 12 // 0xc +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 24 // 0x18 +; CHECK-GI-NEXT: .byte 28 // 0x1c +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-LABEL: .LCPI15_2: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 8 // 0x8 +; CHECK-GI-NEXT: .byte 12 // 0xc +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 24 // 0x18 +; CHECK-GI-NEXT: .byte 28 // 0x1c +; CHECK-GI-NEXT: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff define <16 x i8> 
@shuffled_tbl2_to_tbl4_mixed_tbl2_mask1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { -; CHECK-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask1: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: adrp x8, .LCPI15_0 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI15_0] -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask1: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: adrp x8, .LCPI15_0 +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI15_0] +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask1: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI15_2 +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI15_2] +; CHECK-GI-NEXT: adrp x8, .LCPI15_1 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: ldr q5, [x8, :lo12:.LCPI15_1] +; CHECK-GI-NEXT: adrp x8, .LCPI15_0 +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v4 +; CHECK-GI-NEXT: tbl.16b v1, { v2, v3 }, v5 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI15_0] +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2 +; CHECK-GI-NEXT: ret 
%t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> ) %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> ret <16 x i8> %s } -; CHECK-LABEL: .LCPI16_0: -; CHECK-NEXT: .byte 0 // 0x0 -; CHECK-NEXT: .byte 4 // 0x4 -; CHECK-NEXT: .byte 52 // 0x34 -; CHECK-NEXT: .byte 12 // 0xc -; CHECK-NEXT: .byte 16 // 0x10 -; CHECK-NEXT: .byte 20 // 0x14 -; CHECK-NEXT: .byte 24 // 0x18 -; CHECK-NEXT: .byte 28 // 0x1c -; CHECK-NEXT: .byte 32 // 0x20 -; CHECK-NEXT: .byte 36 // 0x24 -; CHECK-NEXT: .byte 40 // 0x28 -; CHECK-NEXT: .byte 44 // 0x2c -; CHECK-NEXT: .byte 48 // 0x30 -; CHECK-NEXT: .byte 52 // 0x34 -; CHECK-NEXT: .byte 56 // 0x38 -; CHECK-NEXT: .byte 60 // 0x3c +; CHECK-SD-LABEL: .LCPI16_0: +; CHECK-SD: .byte 0 // 0x0 +; CHECK-SD-NEXT: .byte 4 // 0x4 +; CHECK-SD-NEXT: .byte 52 // 0x34 +; CHECK-SD-NEXT: .byte 12 // 0xc +; CHECK-SD-NEXT: .byte 16 // 0x10 +; CHECK-SD-NEXT: .byte 20 // 0x14 +; CHECK-SD-NEXT: .byte 24 // 0x18 +; CHECK-SD-NEXT: .byte 28 // 0x1c +; CHECK-SD-NEXT: .byte 32 // 0x20 +; CHECK-SD-NEXT: .byte 36 // 0x24 +; CHECK-SD-NEXT: .byte 40 // 0x28 +; CHECK-SD-NEXT: .byte 44 // 0x2c +; CHECK-SD-NEXT: .byte 48 // 0x30 +; CHECK-SD-NEXT: .byte 52 // 0x34 +; CHECK-SD-NEXT: .byte 56 // 0x38 +; CHECK-SD-NEXT: .byte 60 // 0x3c + +; CHECK-GI-LABEL: .LCPI16_0: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 1 // 0x1 +; CHECK-GI-NEXT: .byte 21 // 0x15 +; CHECK-GI-NEXT: .byte 3 // 0x3 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 5 // 0x5 +; CHECK-GI-NEXT: .byte 6 // 0x6 +; CHECK-GI-NEXT: .byte 7 // 0x7 +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 17 // 0x11 +; CHECK-GI-NEXT: .byte 18 // 0x12 +; CHECK-GI-NEXT: .byte 19 // 0x13 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 21 // 0x15 +; CHECK-GI-NEXT: .byte 22 // 0x16 +; CHECK-GI-NEXT: .byte 23 // 0x17 +; CHECK-GI-LABEL: .LCPI16_1: +; CHECK-GI: .byte 0 
// 0x0 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 8 // 0x8 +; CHECK-GI-NEXT: .byte 12 // 0xc +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 24 // 0x18 +; CHECK-GI-NEXT: .byte 28 // 0x1c +; CHECK-GI-NEXT: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-LABEL: .LCPI16_2: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 8 // 0x8 +; CHECK-GI-NEXT: .byte 12 // 0xc +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 24 // 0x18 +; CHECK-GI-NEXT: .byte 28 // 0x1c +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { -; CHECK-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask2: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: adrp x8, .LCPI16_0 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI16_0] -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: adrp x8, .LCPI16_0 +; CHECK-SD-NEXT: // 
kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI16_0] +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI16_2 +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI16_2] +; CHECK-GI-NEXT: adrp x8, .LCPI16_1 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: ldr q5, [x8, :lo12:.LCPI16_1] +; CHECK-GI-NEXT: adrp x8, .LCPI16_0 +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v4 +; CHECK-GI-NEXT: tbl.16b v1, { v2, v3 }, v5 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI16_0] +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2 +; CHECK-GI-NEXT: ret %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> ) %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> @@ -514,73 +1144,121 @@ define <16 x i8> @tbx1_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) nounwind { } define <8 x i8> @tbx2_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) { -; CHECK-LABEL: tbx2_8b: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q1_q2 def $q1_q2 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q1_q2 def $q1_q2 -; CHECK-NEXT: tbx.8b v0, { v1, v2 }, v3 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: tbx2_8b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q1_q2 def $q1_q2 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q1_q2 def $q1_q2 +; 
CHECK-SD-NEXT: tbx.8b v0, { v1, v2 }, v3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: tbx2_8b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q1_q2 def $q1_q2 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q1_q2 def $q1_q2 +; CHECK-GI-NEXT: tbx.8b v0, { v1, v2 }, v3 +; CHECK-GI-NEXT: ret %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) ret <8 x i8> %tmp3 } define <16 x i8> @tbx2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) { -; CHECK-LABEL: tbx2_16b: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q1_q2 def $q1_q2 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q1_q2 def $q1_q2 -; CHECK-NEXT: tbx.16b v0, { v1, v2 }, v3 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: tbx2_16b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q1_q2 def $q1_q2 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q1_q2 def $q1_q2 +; CHECK-SD-NEXT: tbx.16b v0, { v1, v2 }, v3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: tbx2_16b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q1_q2 def $q1_q2 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q1_q2 def $q1_q2 +; CHECK-GI-NEXT: tbx.16b v0, { v1, v2 }, v3 +; CHECK-GI-NEXT: ret %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) ret <16 x i8> %tmp3 } define <8 x i8> @tbx3_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) { -; CHECK-LABEL: tbx3_8b: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q1_q2_q3 def $q1_q2_q3 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q1_q2_q3 def $q1_q2_q3 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q1_q2_q3 def $q1_q2_q3 -; CHECK-NEXT: tbx.8b v0, { v1, v2, v3 }, v4 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: tbx3_8b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q1_q2_q3 def $q1_q2_q3 +; 
CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q1_q2_q3 def $q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q1_q2_q3 def $q1_q2_q3 +; CHECK-SD-NEXT: tbx.8b v0, { v1, v2, v3 }, v4 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: tbx3_8b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q1_q2_q3 def $q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q1_q2_q3 def $q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q1_q2_q3 def $q1_q2_q3 +; CHECK-GI-NEXT: tbx.8b v0, { v1, v2, v3 }, v4 +; CHECK-GI-NEXT: ret %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(< 8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) ret <8 x i8> %tmp3 } define <16 x i8> @tbx3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) { -; CHECK-LABEL: tbx3_16b: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q1_q2_q3 def $q1_q2_q3 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q1_q2_q3 def $q1_q2_q3 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q1_q2_q3 def $q1_q2_q3 -; CHECK-NEXT: tbx.16b v0, { v1, v2, v3 }, v4 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: tbx3_16b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q1_q2_q3 def $q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q1_q2_q3 def $q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q1_q2_q3 def $q1_q2_q3 +; CHECK-SD-NEXT: tbx.16b v0, { v1, v2, v3 }, v4 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: tbx3_16b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q1_q2_q3 def $q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q1_q2_q3 def $q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q1_q2_q3 def $q1_q2_q3 +; CHECK-GI-NEXT: tbx.16b v0, { v1, v2, v3 }, v4 +; CHECK-GI-NEXT: ret %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) ret <16 x i8> 
%tmp3 } define <8 x i8> @tbx4_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F) { -; CHECK-LABEL: tbx4_8b: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q4 killed $q4 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 -; CHECK-NEXT: tbx.8b v0, { v1, v2, v3, v4 }, v5 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: tbx4_8b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q4 killed $q4 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-SD-NEXT: tbx.8b v0, { v1, v2, v3, v4 }, v5 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: tbx4_8b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-GI-NEXT: // kill: def $q4 killed $q4 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-GI-NEXT: tbx.8b v0, { v1, v2, v3, v4 }, v5 +; CHECK-GI-NEXT: ret %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F) ret <8 x i8> %tmp3 } define <16 x i8> @tbx4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F) { -; CHECK-LABEL: tbx4_16b: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q4 killed $q4 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed 
$q1_q2_q3_q4 def $q1_q2_q3_q4 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 -; CHECK-NEXT: tbx.16b v0, { v1, v2, v3, v4 }, v5 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: tbx4_16b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q4 killed $q4 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-SD-NEXT: tbx.16b v0, { v1, v2, v3, v4 }, v5 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: tbx4_16b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-GI-NEXT: // kill: def $q4 killed $q4 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-GI-NEXT: tbx.16b v0, { v1, v2, v3, v4 }, v5 +; CHECK-GI-NEXT: ret %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F) ret <16 x i8> %tmp3 } @@ -594,6 +1272,3 @@ declare <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, declare <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone declare <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; CHECK-GI: {{.*}} -; CHECK-SD: {{.*}} From 8e1290432adf33a7aeca65a53d1faa7577ed0e66 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Wed, 29 May 2024 14:19:49 +0200 Subject: [PATCH 113/230] [lldb/DWARF] Refactor DWARFDIE::Get{Decl,TypeLookup}Context (#93291) After a bug (the bug is that the functions don't handle DW_AT_signature, aka type units) led me to one of these similar-but-different functions, I started to realize that most of the differences between these two functions are actually bugs. As a first step towards merging them, this patch rewrites both of them to follow the same pattern, while preserving all of their differences. The main change is that GetTypeLookupContext now also uses a `seen` list to avoid reference loops (currently that's not necessary because the function strictly follows parent links, but that will change with DW_AT_signatures). I've also optimized both functions to avoid recursion by starting construction with the deepest scope first (and then reversing it). --- lldb/include/lldb/Symbol/Type.h | 2 + .../Plugins/SymbolFile/DWARF/DWARFDIE.cpp | 197 +++++++++--------- lldb/source/Symbol/Type.cpp | 7 + .../SymbolFile/DWARF/DWARFDIETest.cpp | 71 +++++++ 4 files changed, 184 insertions(+), 93 deletions(-) diff --git a/lldb/include/lldb/Symbol/Type.h b/lldb/include/lldb/Symbol/Type.h index 7aa0852676e465..c6f30cde818674 100644 --- a/lldb/include/lldb/Symbol/Type.h +++ b/lldb/include/lldb/Symbol/Type.h @@ -62,6 +62,8 @@ struct CompilerContext { CompilerContextKind kind; ConstString name; }; +llvm::raw_ostream &operator<<(llvm::raw_ostream &os, + const CompilerContext &rhs); /// Match \p context_chain against \p pattern, which may contain "Any" /// kinds. The \p context_chain should *not* contain any "Any" kinds.
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp index 4884374ef94729..03e289bbf33005 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp @@ -13,6 +13,7 @@ #include "DWARFDebugInfoEntry.h" #include "DWARFDeclContext.h" #include "DWARFUnit.h" +#include "lldb/Symbol/Type.h" #include "llvm/ADT/iterator.h" @@ -379,108 +380,118 @@ std::vector DWARFDIE::GetDeclContextDIEs() const { return result; } -static std::vector -GetDeclContextImpl(llvm::SmallSet &seen, DWARFDIE die) { - std::vector context; +static void GetDeclContextImpl(DWARFDIE die, + llvm::SmallSet &seen, + std::vector &context) { // Stop if we hit a cycle. - if (!die || !seen.insert(die.GetID()).second) - return context; - - // Handle outline member function DIEs by following the specification. - if (DWARFDIE spec = die.GetReferencedDIE(DW_AT_specification)) - return GetDeclContextImpl(seen, spec); - - // Get the parent context chain. - context = GetDeclContextImpl(seen, die.GetParent()); + while (die && seen.insert(die.GetID()).second) { + // Handle outline member function DIEs by following the specification. + if (DWARFDIE spec = die.GetReferencedDIE(DW_AT_specification)) { + die = spec; + continue; + } - // Add this DIE's contribution at the end of the chain. 
- auto push_ctx = [&](CompilerContextKind kind, llvm::StringRef name) { - context.push_back({kind, ConstString(name)}); - }; - switch (die.Tag()) { - case DW_TAG_module: - push_ctx(CompilerContextKind::Module, die.GetName()); - break; - case DW_TAG_namespace: - push_ctx(CompilerContextKind::Namespace, die.GetName()); - break; - case DW_TAG_structure_type: - push_ctx(CompilerContextKind::Struct, die.GetName()); - break; - case DW_TAG_union_type: - push_ctx(CompilerContextKind::Union, die.GetName()); - break; - case DW_TAG_class_type: - push_ctx(CompilerContextKind::Class, die.GetName()); - break; - case DW_TAG_enumeration_type: - push_ctx(CompilerContextKind::Enum, die.GetName()); - break; - case DW_TAG_subprogram: - push_ctx(CompilerContextKind::Function, die.GetName()); - break; - case DW_TAG_variable: - push_ctx(CompilerContextKind::Variable, die.GetPubname()); - break; - case DW_TAG_typedef: - push_ctx(CompilerContextKind::Typedef, die.GetName()); - break; - default: - break; + // Add this DIE's contribution at the end of the chain. 
+ auto push_ctx = [&](CompilerContextKind kind, llvm::StringRef name) { + context.push_back({kind, ConstString(name)}); + }; + switch (die.Tag()) { + case DW_TAG_module: + push_ctx(CompilerContextKind::Module, die.GetName()); + break; + case DW_TAG_namespace: + push_ctx(CompilerContextKind::Namespace, die.GetName()); + break; + case DW_TAG_structure_type: + push_ctx(CompilerContextKind::Struct, die.GetName()); + break; + case DW_TAG_union_type: + push_ctx(CompilerContextKind::Union, die.GetName()); + break; + case DW_TAG_class_type: + push_ctx(CompilerContextKind::Class, die.GetName()); + break; + case DW_TAG_enumeration_type: + push_ctx(CompilerContextKind::Enum, die.GetName()); + break; + case DW_TAG_subprogram: + push_ctx(CompilerContextKind::Function, die.GetName()); + break; + case DW_TAG_variable: + push_ctx(CompilerContextKind::Variable, die.GetPubname()); + break; + case DW_TAG_typedef: + push_ctx(CompilerContextKind::Typedef, die.GetName()); + break; + default: + break; + } + // Now process the parent. + die = die.GetParent(); } - return context; } -std::vector DWARFDIE::GetDeclContext() const { +std::vector DWARFDIE::GetDeclContext() const { llvm::SmallSet seen; - return GetDeclContextImpl(seen, *this); + std::vector context; + GetDeclContextImpl(*this, seen, context); + std::reverse(context.begin(), context.end()); + return context; } -std::vector -DWARFDIE::GetTypeLookupContext() const { - std::vector context; - // If there is no name, then there is no need to look anything up for this - // DIE. 
- const char *name = GetName(); - if (!name || !name[0]) - return context; - const dw_tag_t tag = Tag(); - if (tag == DW_TAG_compile_unit || tag == DW_TAG_partial_unit) - return context; - DWARFDIE parent = GetParent(); - if (parent) - context = parent.GetTypeLookupContext(); - auto push_ctx = [&](CompilerContextKind kind, llvm::StringRef name) { - context.push_back({kind, ConstString(name)}); - }; - switch (tag) { - case DW_TAG_namespace: - push_ctx(CompilerContextKind::Namespace, name); - break; - case DW_TAG_structure_type: - push_ctx(CompilerContextKind::Struct, name); - break; - case DW_TAG_union_type: - push_ctx(CompilerContextKind::Union, name); - break; - case DW_TAG_class_type: - push_ctx(CompilerContextKind::Class, name); - break; - case DW_TAG_enumeration_type: - push_ctx(CompilerContextKind::Enum, name); - break; - case DW_TAG_variable: - push_ctx(CompilerContextKind::Variable, GetPubname()); - break; - case DW_TAG_typedef: - push_ctx(CompilerContextKind::Typedef, name); - break; - case DW_TAG_base_type: - push_ctx(CompilerContextKind::Builtin, name); - break; - default: - break; +static void GetTypeLookupContextImpl(DWARFDIE die, + llvm::SmallSet &seen, + std::vector &context) { + // Stop if we hit a cycle. + while (die && seen.insert(die.GetID()).second) { + // If there is no name, then there is no need to look anything up for this + // DIE. + const char *name = die.GetName(); + if (!name || !name[0]) + return; + + // Add this DIE's contribution at the end of the chain. 
+ auto push_ctx = [&](CompilerContextKind kind, llvm::StringRef name) { + context.push_back({kind, ConstString(name)}); + }; + switch (die.Tag()) { + case DW_TAG_namespace: + push_ctx(CompilerContextKind::Namespace, die.GetName()); + break; + case DW_TAG_structure_type: + push_ctx(CompilerContextKind::Struct, die.GetName()); + break; + case DW_TAG_union_type: + push_ctx(CompilerContextKind::Union, die.GetName()); + break; + case DW_TAG_class_type: + push_ctx(CompilerContextKind::Class, die.GetName()); + break; + case DW_TAG_enumeration_type: + push_ctx(CompilerContextKind::Enum, die.GetName()); + break; + case DW_TAG_variable: + push_ctx(CompilerContextKind::Variable, die.GetPubname()); + break; + case DW_TAG_typedef: + push_ctx(CompilerContextKind::Typedef, die.GetName()); + break; + case DW_TAG_base_type: + push_ctx(CompilerContextKind::Builtin, name); + break; + default: + break; + } + // Now process the parent. + die = die.GetParent(); } +} + +std::vector DWARFDIE::GetTypeLookupContext() const { + llvm::SmallSet seen; + std::vector context; + GetTypeLookupContextImpl(*this, seen, context); + std::reverse(context.begin(), context.end()); return context; } diff --git a/lldb/source/Symbol/Type.cpp b/lldb/source/Symbol/Type.cpp index 6bf69c2ded2874..585808ace15ce8 100644 --- a/lldb/source/Symbol/Type.cpp +++ b/lldb/source/Symbol/Type.cpp @@ -36,6 +36,13 @@ using namespace lldb; using namespace lldb_private; +llvm::raw_ostream &lldb_private::operator<<(llvm::raw_ostream &os, + const CompilerContext &rhs) { + StreamString lldb_stream; + rhs.Dump(lldb_stream); + return os << lldb_stream.GetString(); +} + bool lldb_private::contextMatches(llvm::ArrayRef context_chain, llvm::ArrayRef pattern) { auto ctx = context_chain.begin(); diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp index 20742ea5123091..bea07dfa27cc6a 100644 --- a/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp +++ 
b/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp @@ -10,6 +10,8 @@ #include "Plugins/SymbolFile/DWARF/DWARFDebugInfo.h" #include "TestingSupport/Symbol/YAMLModuleTester.h" #include "lldb/Core/dwarf.h" +#include "lldb/Symbol/Type.h" +#include "lldb/lldb-private-enumerations.h" #include "llvm/ADT/STLExtras.h" #include "gmock/gmock.h" #include "gtest/gtest.h" @@ -187,3 +189,72 @@ TEST(DWARFDIETest, PeekName) { dw_offset_t fifth_die_offset = 26; EXPECT_EQ(unit->PeekDIEName(fifth_die_offset), "NameType2"); } + +TEST(DWARFDIETest, GetContext) { + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_386 +DWARF: + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_namespace + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + - Code: 0x3 + Tag: DW_TAG_structure_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + debug_info: + - Version: 4 + AddrSize: 8 + Entries: + - AbbrCode: 0x1 + Values: + - Value: 0x000000000000000C + - AbbrCode: 0x2 + Values: + - CStr: NAMESPACE + - AbbrCode: 0x3 + Values: + - CStr: STRUCT + - AbbrCode: 0x0 + - AbbrCode: 0x0 +)"; + + YAMLModuleTester t(yamldata); + auto *symbol_file = + llvm::cast(t.GetModule()->GetSymbolFile()); + DWARFUnit *unit = symbol_file->DebugInfo().GetUnitAtIndex(0); + ASSERT_TRUE(unit); + + auto make_namespace = [](llvm::StringRef name) { + return CompilerContext(CompilerContextKind::Namespace, ConstString(name)); + }; + auto make_struct = [](llvm::StringRef name) { + return CompilerContext(CompilerContextKind::Struct, ConstString(name)); + }; + DWARFDIE struct_die = unit->DIE().GetFirstChild().GetFirstChild(); + ASSERT_TRUE(struct_die); + EXPECT_THAT( + struct_die.GetDeclContext(), + 
testing::ElementsAre(make_namespace("NAMESPACE"), make_struct("STRUCT"))); + EXPECT_THAT( + struct_die.GetTypeLookupContext(), + testing::ElementsAre(make_namespace("NAMESPACE"), make_struct("STRUCT"))); +} From 3ce9b86cfd2d88162bc4a551dd7910b8cff3097b Mon Sep 17 00:00:00 2001 From: Tuan Chuong Goh Date: Wed, 29 May 2024 12:23:02 +0000 Subject: [PATCH 114/230] [AArch64][NFC] Pre-commit Test for Combine MUL(AND(LSHR)) to CMLTz (#92915) --- llvm/test/CodeGen/AArch64/mulcmle.ll | 135 +++++++++++++++++++++------ 1 file changed, 105 insertions(+), 30 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/mulcmle.ll b/llvm/test/CodeGen/AArch64/mulcmle.ll index 5c216b85500801..b22c75259adf27 100644 --- a/llvm/test/CodeGen/AArch64/mulcmle.ll +++ b/llvm/test/CodeGen/AArch64/mulcmle.ll @@ -1,11 +1,22 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s +; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64 %s -o - -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI define <1 x i64> @v1i64(<1 x i64> %a) { -; CHECK-LABEL: v1i64: -; CHECK: // %bb.0: -; CHECK-NEXT: cmlt v0.2s, v0.2s, #0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: v1i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmlt v0.2s, v0.2s, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v1i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: lsr x8, x8, #31 +; CHECK-GI-NEXT: and x8, x8, #0x100000001 +; CHECK-GI-NEXT: lsl x9, x8, #32 +; CHECK-GI-NEXT: sub x8, x9, x8 +; CHECK-GI-NEXT: fmov d0, x8 +; CHECK-GI-NEXT: ret %b = lshr <1 x i64> %a, %c = and <1 x i64> %b, %d = mul nuw <1 x i64> %c, @@ -13,10 +24,26 @@ define <1 x i64> @v1i64(<1 x i64> %a) { } define <2 x i64> @v2i64(<2 x i64> %a) { -; CHECK-LABEL: v2i64: -; CHECK: // %bb.0: -; CHECK-NEXT: cmlt v0.4s, v0.4s, #0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: v2i64: +; CHECK-SD: // %bb.0: +; 
CHECK-SD-NEXT: cmlt v0.4s, v0.4s, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v2i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v1.4s, #1 +; CHECK-GI-NEXT: ushr v0.2d, v0.2d, #31 +; CHECK-GI-NEXT: movi v2.2d, #0x000000ffffffff +; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: fmov x11, d2 +; CHECK-GI-NEXT: mov x9, v2.d[1] +; CHECK-GI-NEXT: fmov x10, d0 +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: mul x8, x8, x9 +; CHECK-GI-NEXT: fmov d0, x10 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret %b = lshr <2 x i64> %a, %c = and <2 x i64> %b, %d = mul nuw <2 x i64> %c, @@ -24,10 +51,19 @@ define <2 x i64> @v2i64(<2 x i64> %a) { } define <2 x i32> @v2i32(<2 x i32> %a) { -; CHECK-LABEL: v2i32: -; CHECK: // %bb.0: -; CHECK-NEXT: cmlt v0.4h, v0.4h, #0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: v2i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmlt v0.4h, v0.4h, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v2i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v1.4h, #1 +; CHECK-GI-NEXT: ushr v0.2s, v0.2s, #15 +; CHECK-GI-NEXT: movi d2, #0x00ffff0000ffff +; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: mul v0.2s, v0.2s, v2.2s +; CHECK-GI-NEXT: ret %b = lshr <2 x i32> %a, %c = and <2 x i32> %b, %d = mul nuw <2 x i32> %c, @@ -35,10 +71,19 @@ define <2 x i32> @v2i32(<2 x i32> %a) { } define <4 x i32> @v4i32(<4 x i32> %a) { -; CHECK-LABEL: v4i32: -; CHECK: // %bb.0: -; CHECK-NEXT: cmlt v0.8h, v0.8h, #0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: v4i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmlt v0.8h, v0.8h, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v4i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v1.8h, #1 +; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #15 +; CHECK-GI-NEXT: movi v2.2d, #0x00ffff0000ffff +; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: mul v0.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: ret %b = lshr <4 x i32> %a, %c = and <4 x i32> %b, %d = mul nuw <4 x i32> %c, @@ -46,11 +91,23 @@ define <4 
x i32> @v4i32(<4 x i32> %a) { } define <8 x i32> @v8i32(<8 x i32> %a) { -; CHECK-LABEL: v8i32: -; CHECK: // %bb.0: -; CHECK-NEXT: cmlt v0.8h, v0.8h, #0 -; CHECK-NEXT: cmlt v1.8h, v1.8h, #0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: v8i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmlt v0.8h, v0.8h, #0 +; CHECK-SD-NEXT: cmlt v1.8h, v1.8h, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v8i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v2.8h, #1 +; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #15 +; CHECK-GI-NEXT: ushr v1.4s, v1.4s, #15 +; CHECK-GI-NEXT: movi v3.2d, #0x00ffff0000ffff +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: mul v0.4s, v0.4s, v3.4s +; CHECK-GI-NEXT: mul v1.4s, v1.4s, v3.4s +; CHECK-GI-NEXT: ret %b = lshr <8 x i32> %a, %c = and <8 x i32> %b, %d = mul nuw <8 x i32> %c, @@ -58,10 +115,19 @@ define <8 x i32> @v8i32(<8 x i32> %a) { } define <4 x i16> @v4i16(<4 x i16> %a) { -; CHECK-LABEL: v4i16: -; CHECK: // %bb.0: -; CHECK-NEXT: cmlt v0.8b, v0.8b, #0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: v4i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmlt v0.8b, v0.8b, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v4i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v1.8b, #1 +; CHECK-GI-NEXT: ushr v0.4h, v0.4h, #7 +; CHECK-GI-NEXT: movi d2, #0xff00ff00ff00ff +; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: mul v0.4h, v0.4h, v2.4h +; CHECK-GI-NEXT: ret %b = lshr <4 x i16> %a, %c = and <4 x i16> %b, %d = mul nuw <4 x i16> %c, @@ -69,10 +135,19 @@ define <4 x i16> @v4i16(<4 x i16> %a) { } define <8 x i16> @v8i16(<8 x i16> %a) { -; CHECK-LABEL: v8i16: -; CHECK: // %bb.0: -; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: v8i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmlt v0.16b, v0.16b, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v8i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v1.16b, #1 +; CHECK-GI-NEXT: ushr v0.8h, v0.8h, #7 +; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff 
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: mul v0.8h, v0.8h, v2.8h +; CHECK-GI-NEXT: ret %b = lshr <8 x i16> %a, %c = and <8 x i16> %b, %d = mul nuw <8 x i16> %c, From 3082258d3a29664a66fcd35c104a40b8cf9d6cba Mon Sep 17 00:00:00 2001 From: Jan Patrick Lehr Date: Wed, 29 May 2024 14:42:48 +0200 Subject: [PATCH 115/230] [CodeGen][X86] Use TargetLowering for TypeInfo of PointerTy (#93469) This uses the TargetLowering getSimpleValueType mechanism to retrieve the ValueType info inside the X86 cost model. This resolves a build issue we were seeing for the miniQMC application after https://github.com/llvm/llvm-project/pull/92671. --- .../lib/Target/X86/X86TargetTransformInfo.cpp | 3 +- ...dle-iptr-with-data-layout-to-not-assert.ll | 35 +++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Analysis/CostModel/X86/handle-iptr-with-data-layout-to-not-assert.ll diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index d935be7669f056..3b18e39d784b22 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -6257,7 +6257,8 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512( AddressSpace, CostKind); unsigned VF = VecTy->getNumElements() / Factor; - MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF); + MVT VT = + MVT::getVectorVT(TLI->getSimpleValueType(DL, VecTy->getScalarType()), VF); InstructionCost MaskCost; if (UseMaskedMemOp) { diff --git a/llvm/test/Analysis/CostModel/X86/handle-iptr-with-data-layout-to-not-assert.ll b/llvm/test/Analysis/CostModel/X86/handle-iptr-with-data-layout-to-not-assert.ll new file mode 100644 index 00000000000000..d0d414a869636b --- /dev/null +++ b/llvm/test/Analysis/CostModel/X86/handle-iptr-with-data-layout-to-not-assert.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter 
"LV: Found an estimated cost of [0-9] for VF [0-9] For instruction:\s*store ptr %[0-9], ptr %__last" --filter "LV: Found an estimated cost of [0-9] for VF [0-9] For instruction:\s*store ptr %[0-9]" --version 5 +; REQUIRES: asserts +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s +target triple = "x86_64-unknown-linux-gnu" + +define ptr @foo(ptr %__first, ptr %__last) #0 { +; CHECK-LABEL: 'foo' +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store ptr %0, ptr %__last, align 8 +; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction: store ptr %0, ptr %__last, align 8 +; CHECK: LV: Found an estimated cost of 3 for VF 4 For instruction: store ptr %0, ptr %__last, align 8 +; CHECK: LV: Found an estimated cost of 3 for VF 8 For instruction: store ptr %0, ptr %__last, align 8 +; +entry: + %cmp.not1 = icmp eq ptr %__first, %__last + br i1 %cmp.not1, label %for.end, label %for.body.preheader + +for.body.preheader: + br label %for.body + +for.body: + %__first.addr.02 = phi ptr [ %incdec.ptr, %for.body ], [ %__first, %for.body.preheader ] + %0 = load ptr, ptr %__first.addr.02, align 8 + store ptr %0, ptr %__last, align 8 + %incdec.ptr = getelementptr inbounds i8, ptr %__first.addr.02, i64 16 + %cmp.not = icmp eq ptr %incdec.ptr, %__last + br i1 %cmp.not, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret ptr null +} + +attributes #0 = { "target-cpu"="znver4" } From 103f6a7606fdc128041bb2e17fb0e992fc6f2225 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Wed, 29 May 2024 08:58:45 -0400 Subject: [PATCH 116/230] Reland "[gn] port 088aa81a5454 (LLVM_HAS_LOGF128)"" This reverts commit 9ebf2f8a67cce570d0752556fed23ff2803aef33. 088aa81a5454 relanded in 3613b2683107b. 
--- llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn | 1 + llvm/utils/gn/secondary/llvm/test/BUILD.gn | 1 + 2 files changed, 2 insertions(+) diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn index e93130eacdc74b..d8266fee05014b 100644 --- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn @@ -318,6 +318,7 @@ write_cmake_config("llvm-config") { "LLVM_ENABLE_ZSTD=", "LLVM_FORCE_USE_OLD_TOOLCHAIN=", "LLVM_HAS_ATOMICS=1", + "LLVM_HAS_LOGF128=", "LLVM_HAVE_TFLITE=", "LLVM_HOST_TRIPLE=$llvm_current_triple", "LLVM_NATIVE_ARCH=$native_target", diff --git a/llvm/utils/gn/secondary/llvm/test/BUILD.gn b/llvm/utils/gn/secondary/llvm/test/BUILD.gn index 826dcf4e6ee9b1..60d6d7b8c3ce7a 100644 --- a/llvm/utils/gn/secondary/llvm/test/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/test/BUILD.gn @@ -64,6 +64,7 @@ write_lit_config("lit_site_cfg") { "LLVM_ENABLE_HTTPLIB=0", "LLVM_ENABLE_ZSTD=0", "LLVM_FORCE_VC_REVISION=", + "LLVM_HAS_LOGF128=0", "LLVM_HAVE_OPT_VIEWER_MODULES=0", "LLVM_HOST_TRIPLE=$llvm_current_triple", "LLVM_INCLUDE_DXIL_TESTS=0", From 9a282724a29899e84adc91bdeaf639010408a80d Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 29 May 2024 15:00:34 +0200 Subject: [PATCH 117/230] [Reassociate] Update test after recent change Fix test expectation after 3bcccb6af685c3132a9ee578b9e11b2503c35a5c. 
--- llvm/test/Transforms/Reassociate/reassoc_bool_vec.ll | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/llvm/test/Transforms/Reassociate/reassoc_bool_vec.ll b/llvm/test/Transforms/Reassociate/reassoc_bool_vec.ll index fcedde23ecc7fd..bd0060cc5abbd9 100644 --- a/llvm/test/Transforms/Reassociate/reassoc_bool_vec.ll +++ b/llvm/test/Transforms/Reassociate/reassoc_bool_vec.ll @@ -57,13 +57,12 @@ define <8 x i1> @vector2(<8 x i1> %a, <8 x i1> %b0, <8 x i1> %b1, <8 x i1> %b2, ; CHECK-NEXT: [[OR6:%.*]] = or <8 x i1> [[B6]], [[A]] ; CHECK-NEXT: [[OR7:%.*]] = or <8 x i1> [[B7]], [[A]] ; CHECK-NEXT: [[XOR0:%.*]] = xor <8 x i1> [[OR1]], [[OR0]] -; CHECK-NEXT: [[XOR1:%.*]] = xor <8 x i1> [[XOR0]], [[OR2]] -; CHECK-NEXT: [[XOR2:%.*]] = xor <8 x i1> [[XOR1]], [[OR3]] -; CHECK-NEXT: [[XOR3:%.*]] = xor <8 x i1> [[XOR2]], [[OR4]] +; CHECK-NEXT: [[XOR2:%.*]] = xor <8 x i1> [[XOR0]], [[OR2]] +; CHECK-NEXT: [[OR045:%.*]] = xor <8 x i1> [[XOR2]], [[OR3]] +; CHECK-NEXT: [[XOR3:%.*]] = xor <8 x i1> [[OR045]], [[OR4]] ; CHECK-NEXT: [[XOR4:%.*]] = xor <8 x i1> [[XOR3]], [[OR5]] ; CHECK-NEXT: [[XOR5:%.*]] = xor <8 x i1> [[XOR4]], [[OR6]] ; CHECK-NEXT: [[XOR6:%.*]] = xor <8 x i1> [[XOR5]], [[OR7]] -; CHECK-NEXT: [[OR045:%.*]] = or <8 x i1> [[XOR1]], [[XOR0]] ; CHECK-NEXT: [[OR4560:%.*]] = or <8 x i1> [[OR045]], [[XOR2]] ; CHECK-NEXT: [[OR023:%.*]] = or <8 x i1> [[OR4560]], [[XOR3]] ; CHECK-NEXT: [[OR001:%.*]] = or <8 x i1> [[OR023]], [[XOR4]] From 23366d4153e1e521a7e5b88d42afea69fb888be7 Mon Sep 17 00:00:00 2001 From: chuongg3 Date: Wed, 29 May 2024 14:15:13 +0100 Subject: [PATCH 118/230] [AArch64][GlobalISel] Combine MUL(AND(LSHR(X, 15), 0x10001), 0xffff) to CMLTz (#92915) This patch mirrors the following SelectionDAG patch for GlobalISel: https://reviews.llvm.org/D130874 --- llvm/lib/Target/AArch64/AArch64Combine.td | 11 +- .../GISel/AArch64PostLegalizerCombiner.cpp | 55 +++++++++ llvm/test/CodeGen/AArch64/mulcmle.ll | 114 ++++-------------- 3 files changed, 90 
insertions(+), 90 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index 1c7f6b870d3904..1ce6cdf1c1e1ed 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -265,6 +265,14 @@ def or_to_bsp: GICombineRule < (apply [{ applyOrToBSP(*${root}, MRI, B, ${matchinfo}); }]) >; +// Combines Mul(And(Srl(X, 15), 0x10001), 0xffff) into CMLTz +def combine_mul_cmlt : GICombineRule< + (defs root:$root, register_matchinfo:$matchinfo), + (match (wip_match_opcode G_MUL):$root, + [{ return matchCombineMulCMLT(*${root}, MRI, ${matchinfo}); }]), + (apply [{ applyCombineMulCMLT(*${root}, MRI, B, ${matchinfo}); }]) +>; + // Post-legalization combines which should happen at all optimization levels. // (E.g. ones that facilitate matching for the selector) For example, matching // pseudos. @@ -296,5 +304,6 @@ def AArch64PostLegalizerCombiner split_store_zero_128, undef_combines, select_to_minmax, or_to_bsp, combine_concat_vector, commute_constant_to_rhs, - push_freeze_to_prevent_poison_from_propagating]> { + push_freeze_to_prevent_poison_from_propagating, + combine_mul_cmlt]> { } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp index d8ca5494ba50a4..7f3e0e01ccd25c 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp @@ -381,6 +381,61 @@ void applyOrToBSP(MachineInstr &MI, MachineRegisterInfo &MRI, MI.eraseFromParent(); } +// Combines Mul(And(Srl(X, 15), 0x10001), 0xffff) into CMLTz +bool matchCombineMulCMLT(MachineInstr &MI, MachineRegisterInfo &MRI, + Register &SrcReg) { + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + + if (DstTy != LLT::fixed_vector(2, 64) && DstTy != LLT::fixed_vector(2, 32) && + DstTy != LLT::fixed_vector(4, 32) && DstTy != LLT::fixed_vector(4, 16) && + DstTy != 
LLT::fixed_vector(8, 16)) + return false; + + auto AndMI = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI); + if (AndMI->getOpcode() != TargetOpcode::G_AND) + return false; + auto LShrMI = getDefIgnoringCopies(AndMI->getOperand(1).getReg(), MRI); + if (LShrMI->getOpcode() != TargetOpcode::G_LSHR) + return false; + + // Check the constant splat values + auto V1 = isConstantOrConstantSplatVector( + *MRI.getVRegDef(MI.getOperand(2).getReg()), MRI); + auto V2 = isConstantOrConstantSplatVector( + *MRI.getVRegDef(AndMI->getOperand(2).getReg()), MRI); + auto V3 = isConstantOrConstantSplatVector( + *MRI.getVRegDef(LShrMI->getOperand(2).getReg()), MRI); + if (!V1.has_value() || !V2.has_value() || !V3.has_value()) + return false; + unsigned HalfSize = DstTy.getScalarSizeInBits() / 2; + if (!V1.value().isMask(HalfSize) || V2.value() != (1ULL | 1ULL << HalfSize) || + V3 != (HalfSize - 1)) + return false; + + SrcReg = LShrMI->getOperand(1).getReg(); + + return true; +} + +void applyCombineMulCMLT(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, Register &SrcReg) { + Register DstReg = MI.getOperand(0).getReg(); + LLT DstTy = MRI.getType(DstReg); + LLT HalfTy = + DstTy.changeElementCount(DstTy.getElementCount().multiplyCoefficientBy(2)) + .changeElementSize(DstTy.getScalarSizeInBits() / 2); + + Register ZeroVec = B.buildConstant(HalfTy, 0).getReg(0); + Register CastReg = + B.buildInstr(TargetOpcode::G_BITCAST, {HalfTy}, {SrcReg}).getReg(0); + Register CMLTReg = + B.buildICmp(CmpInst::Predicate::ICMP_SLT, HalfTy, CastReg, ZeroVec) + .getReg(0); + + B.buildInstr(TargetOpcode::G_BITCAST, {DstReg}, {CMLTReg}).getReg(0); + MI.eraseFromParent(); +} + class AArch64PostLegalizerCombinerImpl : public Combiner { protected: // TODO: Make CombinerHelper methods const. 
diff --git a/llvm/test/CodeGen/AArch64/mulcmle.ll b/llvm/test/CodeGen/AArch64/mulcmle.ll index b22c75259adf27..32bc5c5e63b3e1 100644 --- a/llvm/test/CodeGen/AArch64/mulcmle.ll +++ b/llvm/test/CodeGen/AArch64/mulcmle.ll @@ -24,26 +24,10 @@ define <1 x i64> @v1i64(<1 x i64> %a) { } define <2 x i64> @v2i64(<2 x i64> %a) { -; CHECK-SD-LABEL: v2i64: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: cmlt v0.4s, v0.4s, #0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: v2i64: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v1.4s, #1 -; CHECK-GI-NEXT: ushr v0.2d, v0.2d, #31 -; CHECK-GI-NEXT: movi v2.2d, #0x000000ffffffff -; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-GI-NEXT: fmov x11, d2 -; CHECK-GI-NEXT: mov x9, v2.d[1] -; CHECK-GI-NEXT: fmov x10, d0 -; CHECK-GI-NEXT: mov x8, v0.d[1] -; CHECK-GI-NEXT: mul x10, x10, x11 -; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: fmov d0, x10 -; CHECK-GI-NEXT: mov v0.d[1], x8 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v0.4s, v0.4s, #0 +; CHECK-NEXT: ret %b = lshr <2 x i64> %a, %c = and <2 x i64> %b, %d = mul nuw <2 x i64> %c, @@ -51,19 +35,10 @@ define <2 x i64> @v2i64(<2 x i64> %a) { } define <2 x i32> @v2i32(<2 x i32> %a) { -; CHECK-SD-LABEL: v2i32: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: cmlt v0.4h, v0.4h, #0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: v2i32: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v1.4h, #1 -; CHECK-GI-NEXT: ushr v0.2s, v0.2s, #15 -; CHECK-GI-NEXT: movi d2, #0x00ffff0000ffff -; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-GI-NEXT: mul v0.2s, v0.2s, v2.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v0.4h, v0.4h, #0 +; CHECK-NEXT: ret %b = lshr <2 x i32> %a, %c = and <2 x i32> %b, %d = mul nuw <2 x i32> %c, @@ -71,19 +46,10 @@ define <2 x i32> @v2i32(<2 x i32> %a) { } define <4 x i32> @v4i32(<4 x i32> %a) { -; CHECK-SD-LABEL: v4i32: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: cmlt v0.8h, v0.8h, #0 -; CHECK-SD-NEXT: ret -; -; 
CHECK-GI-LABEL: v4i32: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v1.8h, #1 -; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #15 -; CHECK-GI-NEXT: movi v2.2d, #0x00ffff0000ffff -; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-GI-NEXT: mul v0.4s, v0.4s, v2.4s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v0.8h, v0.8h, #0 +; CHECK-NEXT: ret %b = lshr <4 x i32> %a, %c = and <4 x i32> %b, %d = mul nuw <4 x i32> %c, @@ -91,23 +57,11 @@ define <4 x i32> @v4i32(<4 x i32> %a) { } define <8 x i32> @v8i32(<8 x i32> %a) { -; CHECK-SD-LABEL: v8i32: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: cmlt v0.8h, v0.8h, #0 -; CHECK-SD-NEXT: cmlt v1.8h, v1.8h, #0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: v8i32: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v2.8h, #1 -; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #15 -; CHECK-GI-NEXT: ushr v1.4s, v1.4s, #15 -; CHECK-GI-NEXT: movi v3.2d, #0x00ffff0000ffff -; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-GI-NEXT: mul v0.4s, v0.4s, v3.4s -; CHECK-GI-NEXT: mul v1.4s, v1.4s, v3.4s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v0.8h, v0.8h, #0 +; CHECK-NEXT: cmlt v1.8h, v1.8h, #0 +; CHECK-NEXT: ret %b = lshr <8 x i32> %a, %c = and <8 x i32> %b, %d = mul nuw <8 x i32> %c, @@ -115,19 +69,10 @@ define <8 x i32> @v8i32(<8 x i32> %a) { } define <4 x i16> @v4i16(<4 x i16> %a) { -; CHECK-SD-LABEL: v4i16: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: cmlt v0.8b, v0.8b, #0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: v4i16: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v1.8b, #1 -; CHECK-GI-NEXT: ushr v0.4h, v0.4h, #7 -; CHECK-GI-NEXT: movi d2, #0xff00ff00ff00ff -; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-GI-NEXT: mul v0.4h, v0.4h, v2.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v0.8b, v0.8b, #0 +; CHECK-NEXT: ret %b = lshr <4 x i16> %a, %c = and <4 x i16> %b, %d = mul nuw <4 x i16> %c, @@ -135,19 
+80,10 @@ define <4 x i16> @v4i16(<4 x i16> %a) { } define <8 x i16> @v8i16(<8 x i16> %a) { -; CHECK-SD-LABEL: v8i16: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: cmlt v0.16b, v0.16b, #0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: v8i16: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v1.16b, #1 -; CHECK-GI-NEXT: ushr v0.8h, v0.8h, #7 -; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff -; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-GI-NEXT: mul v0.8h, v0.8h, v2.8h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 +; CHECK-NEXT: ret %b = lshr <8 x i16> %a, %c = and <8 x i16> %b, %d = mul nuw <8 x i16> %c, From 4ffe26334e563a3fea70c2a05de0410a2a3856d7 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 29 May 2024 15:23:59 +0200 Subject: [PATCH 119/230] [InstSimplify] Generate test checks (NFC) --- .../InstSimplify/ConstProp/vectorgep-crash.ll | 19 +++-- .../Transforms/InstSimplify/vector_gep.ll | 73 +++++++++++-------- 2 files changed, 56 insertions(+), 36 deletions(-) diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vectorgep-crash.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vectorgep-crash.ll index 5f554501206206..00ee7f8a92b218 100644 --- a/llvm/test/Transforms/InstSimplify/ConstProp/vectorgep-crash.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/vectorgep-crash.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -passes=instcombine -S -o - %s | FileCheck %s ; Tests that we don't crash upon encountering a vector GEP @@ -23,17 +24,21 @@ top: %struct.C = type { i64 } @G = internal global [65 x %struct.A] zeroinitializer, align 16 -; CHECK-LABEL: @test -; CHECK: ret <16 x ptr> getelementptr ([65 x %struct.A], ptr @G, <16 x i64> zeroinitializer, <16 x i64> , i32 0) define <16 x ptr> @test() { +; CHECK-LABEL: define <16 x ptr> @test() { +; CHECK-NEXT: [[VECTOR_BODY:.*:]] +; CHECK-NEXT: ret <16 x ptr> getelementptr ([65 x 
%struct.A], ptr @G, <16 x i64> zeroinitializer, <16 x i64> , i32 0) +; vector.body: %VectorGep = getelementptr [65 x %struct.A], ptr @G, <16 x i64> zeroinitializer, <16 x i64> , <16 x i32> zeroinitializer ret <16 x ptr> %VectorGep } -; CHECK-LABEL: @test2 -; CHECK: ret <16 x ptr> getelementptr ([65 x %struct.A], ptr @G, <16 x i64> zeroinitializer, <16 x i64> @test2() { +; CHECK-LABEL: define <16 x ptr> @test2() { +; CHECK-NEXT: [[VECTOR_BODY:.*:]] +; CHECK-NEXT: ret <16 x ptr> getelementptr ([65 x %struct.A], ptr @G, <16 x i64> zeroinitializer, <16 x i64> , i32 0) +; vector.body: %VectorGep = getelementptr [65 x %struct.A], ptr @G, <16 x i32> zeroinitializer, <16 x i64> , <16 x i32> zeroinitializer ret <16 x ptr> %VectorGep @@ -42,7 +47,7 @@ vector.body: @g = external global i8, align 1 define <2 x ptr> @constant_zero_index() { -; CHECK-LABEL: @constant_zero_index( +; CHECK-LABEL: define <2 x ptr> @constant_zero_index() { ; CHECK-NEXT: ret <2 x ptr> ; %gep = getelementptr i8, ptr @g, <2 x i64> zeroinitializer @@ -50,7 +55,7 @@ define <2 x ptr> @constant_zero_index() { } define <2 x ptr> @constant_undef_index() { -; CHECK-LABEL: @constant_undef_index( +; CHECK-LABEL: define <2 x ptr> @constant_undef_index() { ; CHECK-NEXT: ret <2 x ptr> ; %gep = getelementptr i8, ptr @g, <2 x i64> undef @@ -58,7 +63,7 @@ define <2 x ptr> @constant_undef_index() { } define <2 x ptr> @constant_inbounds() { -; CHECK-LABEL: @constant_inbounds( +; CHECK-LABEL: define <2 x ptr> @constant_inbounds() { ; CHECK-NEXT: ret <2 x ptr> getelementptr inbounds (i8, ptr @g, <2 x i64> ) ; %gep = getelementptr i8, ptr @g, <2 x i64> diff --git a/llvm/test/Transforms/InstSimplify/vector_gep.ll b/llvm/test/Transforms/InstSimplify/vector_gep.ll index ba0d978ed5b3cf..79aa9f13d1ea72 100644 --- a/llvm/test/Transforms/InstSimplify/vector_gep.ll +++ b/llvm/test/Transforms/InstSimplify/vector_gep.ll @@ -1,105 +1,120 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: 
--version 5 ; RUN: opt -S -passes=instsimplify < %s | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" declare void @helper(<2 x ptr>) define void @test(<2 x ptr> %a) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: <2 x ptr> [[A:%.*]]) { +; CHECK-NEXT: call void @helper(<2 x ptr> [[A]]) +; CHECK-NEXT: ret void +; %A = getelementptr i8, <2 x ptr> %a, <2 x i32> call void @helper(<2 x ptr> %A) ret void } define <4 x ptr> @test1(<4 x ptr> %a) { +; CHECK-LABEL: define <4 x ptr> @test1( +; CHECK-SAME: <4 x ptr> [[A:%.*]]) { +; CHECK-NEXT: ret <4 x ptr> [[A]] +; %gep = getelementptr i8, <4 x ptr> %a, <4 x i32> zeroinitializer ret <4 x ptr> %gep - -; CHECK-LABEL: @test1 -; CHECK-NEXT: ret <4 x ptr> %a } define <4 x ptr> @test2(<4 x ptr> %a) { +; CHECK-LABEL: define <4 x ptr> @test2( +; CHECK-SAME: <4 x ptr> [[A:%.*]]) { +; CHECK-NEXT: ret <4 x ptr> [[A]] +; %gep = getelementptr i8, <4 x ptr> %a ret <4 x ptr> %gep - -; CHECK-LABEL: @test2 -; CHECK-NEXT: ret <4 x ptr> %a } %struct = type { double, float } define <4 x ptr> @test3() { +; CHECK-LABEL: define <4 x ptr> @test3() { +; CHECK-NEXT: ret <4 x ptr> undef +; %gep = getelementptr %struct, <4 x ptr> undef, <4 x i32> , <4 x i32> ret <4 x ptr> %gep - -; CHECK-LABEL: @test3 -; CHECK-NEXT: ret <4 x ptr> undef } %struct.empty = type { } define <4 x ptr> @test4(<4 x ptr> %a) { +; CHECK-LABEL: define <4 x ptr> @test4( +; CHECK-SAME: <4 x ptr> [[A:%.*]]) { +; CHECK-NEXT: ret <4 x ptr> [[A]] +; %gep = getelementptr %struct.empty, <4 x ptr> %a, <4 x i32> ret <4 x ptr> %gep - -; CHECK-LABEL: @test4 -; CHECK-NEXT: ret <4 x ptr> %a } define <4 x ptr> @test5() { +; CHECK-LABEL: define <4 x ptr> @test5() { +; CHECK-NEXT: ret <4 x ptr> getelementptr (i8, <4 x ptr> , <4 x i64> ) +; %c = inttoptr <4 x i64> to <4 x ptr> %gep = getelementptr i8, <4 x ptr> %c, <4 x i32> ret <4 x ptr> %gep - -; CHECK-LABEL: @test5 -; CHECK-NEXT: ret <4 x ptr> getelementptr (i8, <4 x ptr> , <4 x i64> ) } @v = global [24 x [42 x [3 x 
i32]]] zeroinitializer, align 16 define <16 x ptr> @test6() { -; CHECK-LABEL: @test6 -; CHECK-NEXT: ret <16 x ptr> getelementptr inbounds ([24 x [42 x [3 x i32]]], ptr @v, <16 x i64> zeroinitializer, <16 x i64> zeroinitializer, <16 x i64> , <16 x i64> zeroinitializer) +; CHECK-LABEL: define <16 x ptr> @test6() { +; CHECK-NEXT: ret <16 x ptr> getelementptr inbounds ([24 x [42 x [3 x i32]]], ptr @v, <16 x i64> zeroinitializer, <16 x i64> zeroinitializer, <16 x i64> , <16 x i64> zeroinitializer) +; %VectorGep = getelementptr [24 x [42 x [3 x i32]]], ptr @v, i64 0, i64 0, <16 x i64> , i64 0 ret <16 x ptr> %VectorGep } ; PR32697 -; CHECK-LABEL: tinkywinky( -; CHECK-NEXT: ret <4 x ptr> undef define <4 x ptr> @tinkywinky() { +; CHECK-LABEL: define <4 x ptr> @tinkywinky() { +; CHECK-NEXT: ret <4 x ptr> undef +; %patatino = getelementptr i8, ptr undef, <4 x i64> undef ret <4 x ptr> %patatino } ; PR32697 -; CHECK-LABEL: dipsy( -; CHECK-NEXT: ret <4 x ptr> undef define <4 x ptr> @dipsy() { +; CHECK-LABEL: define <4 x ptr> @dipsy() { +; CHECK-NEXT: ret <4 x ptr> undef +; %patatino = getelementptr i8, <4 x ptr> undef, <4 x i64> undef ret <4 x ptr> %patatino } ; PR32697 -; CHECK-LABEL: laalaa( -; CHECK-NEXT: ret <4 x ptr> undef define <4 x ptr> @laalaa() { +; CHECK-LABEL: define <4 x ptr> @laalaa() { +; CHECK-NEXT: ret <4 x ptr> undef +; %patatino = getelementptr i8, <4 x ptr> undef, i64 undef ret <4 x ptr> %patatino } define <2 x ptr> @zero_index(ptr %p) { -; CHECK-LABEL: @zero_index( -; CHECK-NEXT: %gep = getelementptr i8, ptr %p, <2 x i64> zeroinitializer -; CHECK-NEXT: ret <2 x ptr> %gep +; CHECK-LABEL: define <2 x ptr> @zero_index( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[P]], <2 x i64> zeroinitializer +; CHECK-NEXT: ret <2 x ptr> [[GEP]] ; %gep = getelementptr i8, ptr %p, <2 x i64> zeroinitializer ret <2 x ptr> %gep } define <2 x ptr> @unsized(ptr %p) { -; CHECK-LABEL: @unsized( -; CHECK-NEXT: %gep = getelementptr {}, ptr %p, <2 
x i64> undef -; CHECK-NEXT: ret <2 x ptr> %gep +; CHECK-LABEL: define <2 x ptr> @unsized( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr {}, ptr [[P]], <2 x i64> undef +; CHECK-NEXT: ret <2 x ptr> [[GEP]] ; %gep = getelementptr {}, ptr %p, <2 x i64> undef ret <2 x ptr> %gep From a49b5cad99ff84c2c9c55db1d5d9d4bfe1411777 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 29 May 2024 15:25:57 +0200 Subject: [PATCH 120/230] [InferAddressSpaces] Generate test checks (NFC) --- .../AMDGPU/infer-address-space.ll | 160 ++++++++++-------- .../InferAddressSpaces/NVPTX/bug31948.ll | 18 +- 2 files changed, 105 insertions(+), 73 deletions(-) diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll index 72109d0cff437e..4290e4f705887f 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll @@ -1,34 +1,38 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=infer-address-spaces %s | FileCheck %s ; Ports of most of test/CodeGen/NVPTX/access-non-generic.ll @scalar = internal addrspace(3) global float 0.0, align 4 @array = internal addrspace(3) global [10 x float] zeroinitializer, align 4 -; CHECK-LABEL: @load_store_lds_f32( -; CHECK: %tmp = load float, ptr addrspace(3) @scalar, align 4 -; CHECK: call void @use(float %tmp) -; CHECK: store float %v, ptr addrspace(3) @scalar, align 4 -; CHECK: call void @llvm.amdgcn.s.barrier() -; CHECK: %tmp2 = load float, ptr addrspace(3) @scalar, align 4 -; CHECK: call void @use(float %tmp2) -; CHECK: store float %v, ptr addrspace(3) @scalar, align 4 -; CHECK: call void @llvm.amdgcn.s.barrier() -; CHECK: %tmp3 = load float, ptr addrspace(3) getelementptr inbounds ([10 x float], ptr addrspace(3) @array, i32 0, i32 5), align 4 
-; CHECK: call void @use(float %tmp3) -; CHECK: store float %v, ptr addrspace(3) getelementptr inbounds ([10 x float], ptr addrspace(3) @array, i32 0, i32 5), align 4 -; CHECK: call void @llvm.amdgcn.s.barrier() -; CHECK: %tmp4 = getelementptr inbounds [10 x float], ptr addrspace(3) @array, i32 0, i32 5 -; CHECK: %tmp5 = load float, ptr addrspace(3) %tmp4, align 4 -; CHECK: call void @use(float %tmp5) -; CHECK: store float %v, ptr addrspace(3) %tmp4, align 4 -; CHECK: call void @llvm.amdgcn.s.barrier() -; CHECK: %tmp7 = getelementptr inbounds [10 x float], ptr addrspace(3) @array, i32 0, i32 %i -; CHECK: %tmp8 = load float, ptr addrspace(3) %tmp7, align 4 -; CHECK: call void @use(float %tmp8) -; CHECK: store float %v, ptr addrspace(3) %tmp7, align 4 -; CHECK: call void @llvm.amdgcn.s.barrier() -; CHECK: ret void define amdgpu_kernel void @load_store_lds_f32(i32 %i, float %v) #0 { +; CHECK-LABEL: define amdgpu_kernel void @load_store_lds_f32( +; CHECK-SAME: i32 [[I:%.*]], float [[V:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP:%.*]] = load float, ptr addrspace(3) @scalar, align 4 +; CHECK-NEXT: call void @use(float [[TMP]]) +; CHECK-NEXT: store float [[V]], ptr addrspace(3) @scalar, align 4 +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(3) @scalar, align 4 +; CHECK-NEXT: call void @use(float [[TMP2]]) +; CHECK-NEXT: store float [[V]], ptr addrspace(3) @scalar, align 4 +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(3) getelementptr inbounds ([10 x float], ptr addrspace(3) @array, i32 0, i32 5), align 4 +; CHECK-NEXT: call void @use(float [[TMP3]]) +; CHECK-NEXT: store float [[V]], ptr addrspace(3) getelementptr inbounds ([10 x float], ptr addrspace(3) @array, i32 0, i32 5), align 4 +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [10 x float], ptr addrspace(3) @array, 
i32 0, i32 5 +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(3) [[TMP4]], align 4 +; CHECK-NEXT: call void @use(float [[TMP5]]) +; CHECK-NEXT: store float [[V]], ptr addrspace(3) [[TMP4]], align 4 +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [10 x float], ptr addrspace(3) @array, i32 0, i32 [[I]] +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr addrspace(3) [[TMP7]], align 4 +; CHECK-NEXT: call void @use(float [[TMP8]]) +; CHECK-NEXT: store float [[V]], ptr addrspace(3) [[TMP7]], align 4 +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: ret void +; bb: %tmp = load float, ptr addrspacecast (ptr addrspace(3) @scalar to ptr), align 4 call void @use(float %tmp) @@ -57,20 +61,27 @@ bb: ret void } -; CHECK-LABEL: @constexpr_load_int_from_float_lds( -; CHECK: %tmp = load i32, ptr addrspace(3) @scalar, align 4 define i32 @constexpr_load_int_from_float_lds() #0 { +; CHECK-LABEL: define i32 @constexpr_load_int_from_float_lds( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr addrspace(3) @scalar, align 4 +; CHECK-NEXT: ret i32 [[TMP]] +; bb: %tmp = load i32, ptr addrspacecast (ptr addrspace(3) @scalar to ptr), align 4 ret i32 %tmp } -; CHECK-LABEL: @load_int_from_global_float( -; CHECK: %tmp1 = getelementptr float, ptr addrspace(1) %input, i32 %i -; CHECK: %tmp2 = getelementptr float, ptr addrspace(1) %tmp1, i32 %j -; CHECK: %tmp4 = load i32, ptr addrspace(1) %tmp2 -; CHECK: ret i32 %tmp4 define i32 @load_int_from_global_float(ptr addrspace(1) %input, i32 %i, i32 %j) #0 { +; CHECK-LABEL: define i32 @load_int_from_global_float( +; CHECK-SAME: ptr addrspace(1) [[INPUT:%.*]], i32 [[I:%.*]], i32 [[J:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, ptr addrspace(1) [[INPUT]], i32 [[I]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr addrspace(1) [[TMP1]], i32 [[J]] +; CHECK-NEXT: [[TMP4:%.*]] = load 
i32, ptr addrspace(1) [[TMP2]], align 4 +; CHECK-NEXT: ret i32 [[TMP4]] +; bb: %tmp = addrspacecast ptr addrspace(1) %input to ptr %tmp1 = getelementptr float, ptr %tmp, i32 %i @@ -79,20 +90,26 @@ bb: ret i32 %tmp4 } -; CHECK-LABEL: @nested_const_expr( -; CHECK: store i32 1, ptr addrspace(3) getelementptr inbounds ([10 x float], ptr addrspace(3) @array, i64 0, i64 1), align 4 define amdgpu_kernel void @nested_const_expr() #0 { +; CHECK-LABEL: define amdgpu_kernel void @nested_const_expr( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: store i32 1, ptr addrspace(3) getelementptr inbounds ([10 x float], ptr addrspace(3) @array, i64 0, i64 1), align 4 +; CHECK-NEXT: ret void +; store i32 1, ptr bitcast (ptr getelementptr ([10 x float], ptr addrspacecast (ptr addrspace(3) @array to ptr), i64 0, i64 1) to ptr), align 4 ret void } -; CHECK-LABEL: @rauw( -; CHECK: %addr = getelementptr float, ptr addrspace(1) %input, i64 10 -; CHECK-NEXT: %v = load float, ptr addrspace(1) %addr -; CHECK-NEXT: store float %v, ptr addrspace(1) %addr -; CHECK-NEXT: ret void define amdgpu_kernel void @rauw(ptr addrspace(1) %input) #0 { +; CHECK-LABEL: define amdgpu_kernel void @rauw( +; CHECK-SAME: ptr addrspace(1) [[INPUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[ADDR:%.*]] = getelementptr float, ptr addrspace(1) [[INPUT]], i64 10 +; CHECK-NEXT: [[V:%.*]] = load float, ptr addrspace(1) [[ADDR]], align 4 +; CHECK-NEXT: store float [[V]], ptr addrspace(1) [[ADDR]], align 4 +; CHECK-NEXT: ret void +; bb: %generic_input = addrspacecast ptr addrspace(1) %input to ptr %addr = getelementptr float, ptr %generic_input, i64 10 @@ -102,20 +119,22 @@ bb: } ; FIXME: Should be able to eliminate the cast inside the loop -; CHECK-LABEL: @loop( - -; CHECK: %end = getelementptr float, ptr addrspace(3) @array, i64 10 -; CHECK: br label %loop - -; CHECK: loop: ; preds = %loop, %entry -; CHECK: %i = phi ptr addrspace(3) [ @array, %entry ], [ %i2, %loop ] -; CHECK: %v = load float, ptr 
addrspace(3) %i -; CHECK: call void @use(float %v) -; CHECK: %i2 = getelementptr float, ptr addrspace(3) %i, i64 1 -; CHECK: %exit_cond = icmp eq ptr addrspace(3) %i2, %end - -; CHECK: br i1 %exit_cond, label %exit, label %loop define amdgpu_kernel void @loop() #0 { +; CHECK-LABEL: define amdgpu_kernel void @loop( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[END:%.*]] = getelementptr float, ptr addrspace(3) @array, i64 10 +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[I:%.*]] = phi ptr addrspace(3) [ @array, %[[ENTRY]] ], [ [[I2:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[V:%.*]] = load float, ptr addrspace(3) [[I]], align 4 +; CHECK-NEXT: call void @use(float [[V]]) +; CHECK-NEXT: [[I2]] = getelementptr float, ptr addrspace(3) [[I]], i64 1 +; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq ptr addrspace(3) [[I2]], [[END]] +; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; entry: %p = addrspacecast ptr addrspace(3) @array to ptr %end = getelementptr float, ptr %p, i64 10 @@ -135,19 +154,23 @@ exit: ; preds = %loop @generic_end = external addrspace(1) global ptr -; CHECK-LABEL: @loop_with_generic_bound( -; CHECK: %end = load ptr, ptr addrspace(1) @generic_end -; CHECK: br label %loop - -; CHECK: loop: -; CHECK: %i = phi ptr addrspace(3) [ @array, %entry ], [ %i2, %loop ] -; CHECK: %v = load float, ptr addrspace(3) %i -; CHECK: call void @use(float %v) -; CHECK: %i2 = getelementptr float, ptr addrspace(3) %i, i64 1 -; CHECK: %0 = addrspacecast ptr addrspace(3) %i2 to ptr -; CHECK: %exit_cond = icmp eq ptr %0, %end -; CHECK: br i1 %exit_cond, label %exit, label %loop define amdgpu_kernel void @loop_with_generic_bound() #0 { +; CHECK-LABEL: define amdgpu_kernel void @loop_with_generic_bound( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[END:%.*]] = load ptr, ptr addrspace(1) @generic_end, align 8 +; CHECK-NEXT: br label 
%[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[I:%.*]] = phi ptr addrspace(3) [ @array, %[[ENTRY]] ], [ [[I2:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[V:%.*]] = load float, ptr addrspace(3) [[I]], align 4 +; CHECK-NEXT: call void @use(float [[V]]) +; CHECK-NEXT: [[I2]] = getelementptr float, ptr addrspace(3) [[I]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(3) [[I2]] to ptr +; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq ptr [[TMP0]], [[END]] +; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; entry: %p = addrspacecast ptr addrspace(3) @array to ptr %end = load ptr, ptr addrspace(1) @generic_end @@ -165,11 +188,14 @@ exit: ; preds = %loop ret void } -; CHECK-LABEL: @select_bug( -; CHECK: %sel = select i1 icmp ne (ptr inttoptr (i64 4873 to ptr), ptr null), i64 73, i64 93 -; CHECK: %add.ptr157 = getelementptr inbounds i64, ptr undef, i64 %sel -; CHECK: %cmp169 = icmp uge ptr undef, %add.ptr157 define void @select_bug() #0 { +; CHECK-LABEL: define void @select_bug( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 icmp ne (ptr inttoptr (i64 4873 to ptr), ptr null), i64 73, i64 93 +; CHECK-NEXT: [[ADD_PTR157:%.*]] = getelementptr inbounds i64, ptr undef, i64 [[SEL]] +; CHECK-NEXT: [[CMP169:%.*]] = icmp uge ptr undef, [[ADD_PTR157]] +; CHECK-NEXT: unreachable +; %sel = select i1 icmp ne (ptr inttoptr (i64 4873 to ptr), ptr null), i64 73, i64 93 %add.ptr157 = getelementptr inbounds i64, ptr undef, i64 %sel %cmp169 = icmp uge ptr undef, %add.ptr157 diff --git a/llvm/test/Transforms/InferAddressSpaces/NVPTX/bug31948.ll b/llvm/test/Transforms/InferAddressSpaces/NVPTX/bug31948.ll index e6b517a73fa463..23c5f99e5d0865 100644 --- a/llvm/test/Transforms/InferAddressSpaces/NVPTX/bug31948.ll +++ b/llvm/test/Transforms/InferAddressSpaces/NVPTX/bug31948.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S 
-mtriple=nvptx64-nvidia-cuda -passes=infer-address-spaces %s | FileCheck %s target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" @@ -6,18 +7,23 @@ target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" @var1 = local_unnamed_addr addrspace(3) externally_initialized global %struct.bar undef, align 8 -; CHECK-LABEL: @bug31948( -; CHECK: %tmp = load ptr, ptr addrspace(3) getelementptr inbounds (%struct.bar, ptr addrspace(3) @var1, i64 0, i32 1), align 8 -; CHECK: %tmp1 = load float, ptr %tmp, align 4 -; CHECK: store float %conv1, ptr %tmp, align 4 -; CHECK: store i32 32, ptr addrspace(3) getelementptr inbounds (%struct.bar, ptr addrspace(3) @var1, i64 0, i32 1), align 4 define void @bug31948(float %a, ptr nocapture readnone %x, ptr nocapture readnone %y) local_unnamed_addr #0 { +; CHECK-LABEL: define void @bug31948( +; CHECK-SAME: float [[A:%.*]], ptr nocapture readnone [[X:%.*]], ptr nocapture readnone [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP:%.*]] = load ptr, ptr addrspace(3) getelementptr inbounds ([[STRUCT_BAR:%.*]], ptr addrspace(3) @var1, i64 0, i32 1), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[TMP]], align 4 +; CHECK-NEXT: [[CONV1:%.*]] = fadd float [[TMP1]], 1.000000e+00 +; CHECK-NEXT: store float [[CONV1]], ptr [[TMP]], align 4 +; CHECK-NEXT: store i32 32, ptr addrspace(3) getelementptr inbounds ([[STRUCT_BAR]], ptr addrspace(3) @var1, i64 0, i32 1), align 4 +; CHECK-NEXT: ret void +; entry: %tmp = load ptr, ptr getelementptr (%struct.bar, ptr addrspacecast (ptr addrspace(3) @var1 to ptr), i64 0, i32 1), align 8 %tmp1 = load float, ptr %tmp, align 4 %conv1 = fadd float %tmp1, 1.000000e+00 store float %conv1, ptr %tmp, align 4 - store i32 32, ptr bitcast (ptr getelementptr (%struct.bar, ptr addrspacecast (ptr addrspace(3) @var1 to ptr), i64 0, i32 1) to ptr), align 4 + store i32 32, ptr getelementptr (%struct.bar, ptr addrspacecast (ptr addrspace(3) @var1 to ptr), i64 0, i32 1), align 4 
ret void } From 9377412c5a8bbfbee93029ef22b4b74949cbe1b5 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Wed, 29 May 2024 08:18:57 -0500 Subject: [PATCH 121/230] [clang][OpenMP] Remove unused include of UniqueVector.h, NFC --- clang/lib/Parse/ParseOpenMP.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index e959dd6378f46b..cd8df3332724f0 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -25,7 +25,6 @@ #include "clang/Sema/SemaOpenMP.h" #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/StringSwitch.h" -#include "llvm/ADT/UniqueVector.h" #include "llvm/Frontend/OpenMP/OMPAssume.h" #include "llvm/Frontend/OpenMP/OMPContext.h" #include From 1ea8caeada6efa991f7221f95fc6df581845895d Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Wed, 29 May 2024 14:32:22 +0100 Subject: [PATCH 122/230] [AArch64] Add patterns for conversions using fixed-point scvtf (#92922) --- .../Target/AArch64/AArch64ISelLowering.cpp | 2 +- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 25 +++++ .../AArch64/fixed-point-conv-vec-pat.ll | 104 ++++++++++++++++++ 3 files changed, 130 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AArch64/fixed-point-conv-vec-pat.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 814bbe27049820..3e2a5bfbc2321c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -14365,7 +14365,7 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, unsigned Opc = (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR; return DAG.getNode(Opc, DL, VT, Op.getOperand(0), - DAG.getConstant(Cnt, DL, MVT::i32)); + DAG.getConstant(Cnt, DL, MVT::i32), Op->getFlags()); } // Right shift register. 
Note, there is not a shift right register diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 4830033b23527c..dd54520c8ddadd 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -733,6 +733,12 @@ def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64UnaryVec>; def AArch64ext : SDNode<"AArch64ISD::EXT", SDT_AArch64ExtVec>; def AArch64vashr : SDNode<"AArch64ISD::VASHR", SDT_AArch64vshift>; + +def AArch64vashr_exact : PatFrag<(ops node:$lhs, node:$rhs), + (AArch64vashr node:$lhs, node:$rhs), [{ + return N->getFlags().hasExact(); +}]>; + def AArch64vlshr : SDNode<"AArch64ISD::VLSHR", SDT_AArch64vshift>; def AArch64vshl : SDNode<"AArch64ISD::VSHL", SDT_AArch64vshift>; def AArch64sqshli : SDNode<"AArch64ISD::SQSHL_I", SDT_AArch64vshift>; @@ -7710,6 +7716,25 @@ defm SCVTF: SIMDVectorRShiftToFP<0, 0b11100, "scvtf", defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn", AArch64rshrn>; defm SHL : SIMDVectorLShiftBHSD<0, 0b01010, "shl", AArch64vshl>; +let Predicates = [HasNEON] in { +def : Pat<(v2f32 (sint_to_fp (v2i32 (AArch64vashr_exact v2i32:$Vn, i32:$shift)))), + (SCVTFv2i32_shift $Vn, vecshiftR32:$shift)>; + +def : Pat<(v4f32 (sint_to_fp (v4i32 (AArch64vashr_exact v4i32:$Vn, i32:$shift)))), + (SCVTFv4i32_shift $Vn, vecshiftR32:$shift)>; + +def : Pat<(v2f64 (sint_to_fp (v2i64 (AArch64vashr_exact v2i64:$Vn, i32:$shift)))), + (SCVTFv2i64_shift $Vn, vecshiftR64:$shift)>; +} + +let Predicates = [HasNEON, HasFullFP16] in { +def : Pat<(v4f16 (sint_to_fp (v4i16 (AArch64vashr_exact v4i16:$Vn, i32:$shift)))), + (SCVTFv4i16_shift $Vn, vecshiftR16:$shift)>; + +def : Pat<(v8f16 (sint_to_fp (v8i16 (AArch64vashr_exact v8i16:$Vn, i32:$shift)))), + (SCVTFv8i16_shift $Vn, vecshiftR16:$shift)>; +} + // X << 1 ==> X + X class SHLToADDPat : Pat<(ty (AArch64vshl (ty regtype:$Rn), (i32 1))), diff --git a/llvm/test/CodeGen/AArch64/fixed-point-conv-vec-pat.ll 
b/llvm/test/CodeGen/AArch64/fixed-point-conv-vec-pat.ll new file mode 100644 index 00000000000000..dff216192a6c3c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/fixed-point-conv-vec-pat.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s | FileCheck %s + +target triple = "aarch64" + +; First some corner cases +define <4 x float> @f_v4_s0(<4 x i32> %u) { +; CHECK-LABEL: f_v4_s0: +; CHECK: // %bb.0: +; CHECK-NEXT: scvtf v0.4s, v0.4s +; CHECK-NEXT: ret + %s = ashr exact <4 x i32> %u, + %v = sitofp <4 x i32> %s to <4 x float> + ret <4 x float> %v +} + +define <4 x float> @f_v4_s1(<4 x i32> %u) { +; CHECK-LABEL: f_v4_s1: +; CHECK: // %bb.0: +; CHECK-NEXT: scvtf v0.4s, v0.4s, #1 +; CHECK-NEXT: ret + %s = ashr exact <4 x i32> %u, + %v = sitofp <4 x i32> %s to <4 x float> + ret <4 x float> %v +} + +define <4 x float> @f_v4_s24_inexact(<4 x i32> %u) { +; CHECK-LABEL: f_v4_s24_inexact: +; CHECK: // %bb.0: +; CHECK-NEXT: sshr v0.4s, v0.4s, #24 +; CHECK-NEXT: scvtf v0.4s, v0.4s +; CHECK-NEXT: ret + %s = ashr <4 x i32> %u, + %v = sitofp <4 x i32> %s to <4 x float> + ret <4 x float> %v +} + +define <4 x float> @f_v4_s31(<4 x i32> %u) { +; CHECK-LABEL: f_v4_s31: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v0.4s, v0.4s, #0 +; CHECK-NEXT: scvtf v0.4s, v0.4s +; CHECK-NEXT: ret + %s = ashr <4 x i32> %u, + %v = sitofp <4 x i32> %s to <4 x float> + ret <4 x float> %v +} + +; Common cases for conversion from signed integer to floating point types +define <2 x float> @f_v2_s24(<2 x i32> %u) { +; CHECK-LABEL: f_v2_s24: +; CHECK: // %bb.0: +; CHECK-NEXT: scvtf v0.2s, v0.2s, #24 +; CHECK-NEXT: ret + %s = ashr exact <2 x i32> %u, + %v = sitofp <2 x i32> %s to <2 x float> + ret <2 x float> %v +} + +define <4 x float> @f_v4_s24(<4 x i32> %u) { +; CHECK-LABEL: f_v4_s24: +; CHECK: // %bb.0: +; CHECK-NEXT: scvtf v0.4s, v0.4s, #24 +; CHECK-NEXT: ret + %s = ashr exact <4 x i32> %u, + %v = sitofp <4 x i32> %s to <4 x 
float> + ret <4 x float> %v +} + +; Check legalisation to <2 x f64> does not get in the way +define <8 x double> @d_v8_s64(<8 x i64> %u) { +; CHECK-LABEL: d_v8_s64: +; CHECK: // %bb.0: +; CHECK-NEXT: scvtf v0.2d, v0.2d, #56 +; CHECK-NEXT: scvtf v1.2d, v1.2d, #56 +; CHECK-NEXT: scvtf v2.2d, v2.2d, #56 +; CHECK-NEXT: scvtf v3.2d, v3.2d, #56 +; CHECK-NEXT: ret + %s = ashr exact <8 x i64> %u, + %v = sitofp <8 x i64> %s to <8 x double> + ret <8 x double> %v +} + +define <4 x half> @h_v4_s8(<4 x i16> %u) #0 { +; CHECK-LABEL: h_v4_s8: +; CHECK: // %bb.0: +; CHECK-NEXT: scvtf v0.4h, v0.4h, #8 +; CHECK-NEXT: ret + %s = ashr exact <4 x i16> %u, + %v = sitofp <4 x i16> %s to <4 x half> + ret <4 x half> %v +} + +define <8 x half> @h_v8_s8(<8 x i16> %u) #0 { +; CHECK-LABEL: h_v8_s8: +; CHECK: // %bb.0: +; CHECK-NEXT: scvtf v0.8h, v0.8h, #8 +; CHECK-NEXT: ret + %s = ashr exact <8 x i16> %u, + %v = sitofp <8 x i16> %s to <8 x half> + ret <8 x half> %v +} + +attributes #0 = { "target-features"="+fullfp16"} From 0dfd2bf4dfd3fc8c0733678186ceb37776597d35 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 29 May 2024 15:36:08 +0200 Subject: [PATCH 123/230] [LTT] Directly create inbounds gep (NFCI) We know that this gep is inbounds. Constant expression construction already infers this fact, but make it explicit. --- llvm/lib/Transforms/IPO/LowerTypeTests.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index 633fcb3314c42f..f86f217bca5886 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -879,7 +879,7 @@ void LowerTypeTestsModule::buildBitSetsFromGlobalVariables( // Multiply by 2 to account for padding elements. 
Constant *CombinedGlobalIdxs[] = {ConstantInt::get(Int32Ty, 0), ConstantInt::get(Int32Ty, I * 2)}; - Constant *CombinedGlobalElemPtr = ConstantExpr::getGetElementPtr( + Constant *CombinedGlobalElemPtr = ConstantExpr::getInBoundsGetElementPtr( NewInit->getType(), CombinedGlobal, CombinedGlobalIdxs); assert(GV->getType()->getAddressSpace() == 0); GlobalAlias *GAlias = From 180448b13c2bfc94f4eef64d2352ad4cf94f01c7 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 29 May 2024 14:40:08 +0100 Subject: [PATCH 124/230] [AMDGPU] Reduce use of continue in SIWholeQuadMode. NFC. (#93659) --- llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 92 +++++++++------------- 1 file changed, 36 insertions(+), 56 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index ea8109bbee9aed..09dc1c781e2f30 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -278,11 +278,10 @@ LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() { for (const MachineInstr &MI : *BII.first) { auto III = Instructions.find(&MI); - if (III == Instructions.end()) - continue; - - dbgs() << " " << MI << " Needs = " << PrintState(III->second.Needs) - << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n'; + if (III != Instructions.end()) { + dbgs() << " " << MI << " Needs = " << PrintState(III->second.Needs) + << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n'; + } } } } @@ -455,10 +454,8 @@ void SIWholeQuadMode::markOperand(const MachineInstr &MI, for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) { LiveRange &LR = LIS->getRegUnit(Unit); const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn(); - if (!Value) - continue; - - markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist); + if (Value) + markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist); } } } @@ -499,19 +496,16 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, if (TII->isWQM(Opcode)) { 
// If LOD is not supported WQM is not needed. - if (!ST->hasExtendedImageInsts()) - continue; // Only generate implicit WQM if implicit derivatives are required. // This avoids inserting unintended WQM if a shader type without // implicit derivatives uses an image sampling instruction. - if (!HasImplicitDerivatives) - continue; - // Sampling instructions don't need to produce results for all pixels - // in a quad, they just require all inputs of a quad to have been - // computed for derivatives. - markInstructionUses(MI, StateWQM, Worklist); - GlobalFlags |= StateWQM; - continue; + if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) { + // Sampling instructions don't need to produce results for all pixels + // in a quad, they just require all inputs of a quad to have been + // computed for derivatives. + markInstructionUses(MI, StateWQM, Worklist); + GlobalFlags |= StateWQM; + } } else if (Opcode == AMDGPU::WQM) { // The WQM intrinsic requires its output to have all the helper lanes // correct, so we need it to be in WQM. 
@@ -520,7 +514,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, } else if (Opcode == AMDGPU::SOFT_WQM) { LowerToCopyInstrs.push_back(&MI); SoftWQMInstrs.push_back(&MI); - continue; } else if (Opcode == AMDGPU::STRICT_WWM) { // The STRICT_WWM intrinsic doesn't make the same guarantee, and plus // it needs to be executed in WQM or Exact so that its copy doesn't @@ -528,7 +521,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, markInstructionUses(MI, StateStrictWWM, Worklist); GlobalFlags |= StateStrictWWM; LowerToMovInstrs.push_back(&MI); - continue; } else if (Opcode == AMDGPU::STRICT_WQM || TII->isDualSourceBlendEXP(MI)) { // STRICT_WQM is similar to STRICTWWM, but instead of enabling all @@ -551,7 +543,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, GlobalFlags |= StateExact; III.Disabled = StateWQM | StateStrict; } - continue; } else if (Opcode == AMDGPU::LDS_PARAM_LOAD || Opcode == AMDGPU::DS_PARAM_LOAD || Opcode == AMDGPU::LDS_DIRECT_LOAD || @@ -561,7 +552,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, InstrInfo &II = Instructions[&MI]; II.Needs |= StateStrictWQM; GlobalFlags |= StateStrictWQM; - continue; } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 || Opcode == AMDGPU::V_SET_INACTIVE_B64) { III.Disabled = StateStrict; @@ -574,7 +564,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, } } SetInactiveInstrs.push_back(&MI); - continue; } else if (TII->isDisableWQM(MI)) { BBI.Needs |= StateExact; if (!(BBI.InNeeds & StateExact)) { @@ -583,40 +572,33 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, } GlobalFlags |= StateExact; III.Disabled = StateWQM | StateStrict; - continue; - } else { - if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) { - LiveMaskQueries.push_back(&MI); - } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR || - Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR || - Opcode == AMDGPU::SI_DEMOTE_I1) { - KillInstrs.push_back(&MI); - 
BBI.NeedsLowering = true; - } else if (WQMOutputs) { - // The function is in machine SSA form, which means that physical - // VGPRs correspond to shader inputs and outputs. Inputs are - // only used, outputs are only defined. - // FIXME: is this still valid? - for (const MachineOperand &MO : MI.defs()) { - if (!MO.isReg()) - continue; - - Register Reg = MO.getReg(); - - if (!Reg.isVirtual() && - TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) { - Flags = StateWQM; - break; - } + } else if (Opcode == AMDGPU::SI_PS_LIVE || + Opcode == AMDGPU::SI_LIVE_MASK) { + LiveMaskQueries.push_back(&MI); + } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR || + Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR || + Opcode == AMDGPU::SI_DEMOTE_I1) { + KillInstrs.push_back(&MI); + BBI.NeedsLowering = true; + } else if (WQMOutputs) { + // The function is in machine SSA form, which means that physical + // VGPRs correspond to shader inputs and outputs. Inputs are + // only used, outputs are only defined. + // FIXME: is this still valid? + for (const MachineOperand &MO : MI.defs()) { + Register Reg = MO.getReg(); + if (Reg.isPhysical() && + TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) { + Flags = StateWQM; + break; } } - - if (!Flags) - continue; } - markInstruction(MI, Flags, Worklist); - GlobalFlags |= Flags; + if (Flags) { + markInstruction(MI, Flags, Worklist); + GlobalFlags |= Flags; + } } } @@ -1568,8 +1550,6 @@ void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) { case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: SplitPoint = lowerKillF32(*MBB, *MI); break; - default: - continue; } if (SplitPoint) splitBlock(MBB, SplitPoint); From 24ddce62c8bb92a19ba3677629c77a2e6f137b1a Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Wed, 29 May 2024 21:42:08 +0800 Subject: [PATCH 125/230] [GISel] Legalize bitreverse with types smaller than 8 bits (#92998) This patch adds support for lowering `bitreverse` with types smaller than 8 bits. 
It also fixes an existing assertion failure in `llvm::APInt::getSplat`: https://godbolt.org/z/7crs8xrcG The lowering logic is copied from SDAG: https://github.com/llvm/llvm-project/blob/2034f2fc8729bd4645ef7caa3c5c6efa284d2d3f/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp#L9384-L9398 --- .../CodeGen/GlobalISel/LegalizerHelper.cpp | 64 ++-- .../test/CodeGen/RISCV/GlobalISel/bitmanip.ll | 207 ++++++++++++ .../legalizer/legalize-bitreverse-rv32.mir | 276 +++++++++++++++- .../legalizer/legalize-bitreverse-rv64.mir | 303 +++++++++++++++++- 4 files changed, 828 insertions(+), 22 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/bitmanip.ll diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index d8b0f52ecf9e32..9208b096affad9 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -7977,27 +7977,51 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerBitreverse(MachineInstr &MI) { auto [Dst, Src] = MI.getFirst2Regs(); const LLT Ty = MRI.getType(Src); - unsigned Size = Ty.getSizeInBits(); + unsigned Size = Ty.getScalarSizeInBits(); + + if (Size >= 8) { + MachineInstrBuilder BSWAP = + MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {Ty}, {Src}); + + // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654 + // [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4] + // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0] + MachineInstrBuilder Swap4 = + SwapN(4, Ty, MIRBuilder, BSWAP, APInt::getSplat(Size, APInt(8, 0xF0))); + + // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76 + // [(val & 0xCCCCCCCC) >> 2] & [(val & 0x33333333) << 2] + // -> [(val & 0xCCCCCCCC) >> 2] & [(val << 2) & 0xCCCCCCCC] + MachineInstrBuilder Swap2 = + SwapN(2, Ty, MIRBuilder, Swap4, APInt::getSplat(Size, APInt(8, 0xCC))); + + // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5 + // 6|7 + // [(val & 
0xAAAAAAAA) >> 1] & [(val & 0x55555555) << 1] + // -> [(val & 0xAAAAAAAA) >> 1] & [(val << 1) & 0xAAAAAAAA] + SwapN(1, Dst, MIRBuilder, Swap2, APInt::getSplat(Size, APInt(8, 0xAA))); + } else { + // Expand bitreverse for types smaller than 8 bits. + MachineInstrBuilder Tmp; + for (unsigned I = 0, J = Size - 1; I < Size; ++I, --J) { + MachineInstrBuilder Tmp2; + if (I < J) { + auto ShAmt = MIRBuilder.buildConstant(Ty, J - I); + Tmp2 = MIRBuilder.buildShl(Ty, Src, ShAmt); + } else { + auto ShAmt = MIRBuilder.buildConstant(Ty, I - J); + Tmp2 = MIRBuilder.buildLShr(Ty, Src, ShAmt); + } - MachineInstrBuilder BSWAP = - MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {Ty}, {Src}); - - // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654 - // [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4] - // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0] - MachineInstrBuilder Swap4 = - SwapN(4, Ty, MIRBuilder, BSWAP, APInt::getSplat(Size, APInt(8, 0xF0))); - - // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76 - // [(val & 0xCCCCCCCC) >> 2] & [(val & 0x33333333) << 2] - // -> [(val & 0xCCCCCCCC) >> 2] & [(val << 2) & 0xCCCCCCCC] - MachineInstrBuilder Swap2 = - SwapN(2, Ty, MIRBuilder, Swap4, APInt::getSplat(Size, APInt(8, 0xCC))); - - // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5 6|7 - // [(val & 0xAAAAAAAA) >> 1] & [(val & 0x55555555) << 1] - // -> [(val & 0xAAAAAAAA) >> 1] & [(val << 1) & 0xAAAAAAAA] - SwapN(1, Dst, MIRBuilder, Swap2, APInt::getSplat(Size, APInt(8, 0xAA))); + auto Mask = MIRBuilder.buildConstant(Ty, 1U << J); + Tmp2 = MIRBuilder.buildAnd(Ty, Tmp2, Mask); + if (I == 0) + Tmp = Tmp2; + else + Tmp = MIRBuilder.buildOr(Ty, Tmp, Tmp2); + } + MIRBuilder.buildCopy(Dst, Tmp); + } MI.eraseFromParent(); return Legalized; diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/bitmanip.ll b/llvm/test/CodeGen/RISCV/GlobalISel/bitmanip.ll new file mode 100644 index 00000000000000..5c42fefb95b39f --- /dev/null +++ 
b/llvm/test/CodeGen/RISCV/GlobalISel/bitmanip.ll @@ -0,0 +1,207 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -global-isel -global-isel-abort=1 < %s 2>&1 | FileCheck %s --check-prefixes=RV32 +; RUN: llc -mtriple=riscv64 -global-isel -global-isel-abort=1 < %s 2>&1 | FileCheck %s --check-prefixes=RV64 + +define i2 @bitreverse_i2(i2 %x) { +; RV32-LABEL: bitreverse_i2: +; RV32: # %bb.0: +; RV32-NEXT: slli a1, a0, 1 +; RV32-NEXT: andi a1, a1, 2 +; RV32-NEXT: andi a0, a0, 3 +; RV32-NEXT: srli a0, a0, 1 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: bitreverse_i2: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a0, 1 +; RV64-NEXT: andi a1, a1, 2 +; RV64-NEXT: andi a0, a0, 3 +; RV64-NEXT: srliw a0, a0, 1 +; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: ret + %rev = call i2 @llvm.bitreverse.i2(i2 %x) + ret i2 %rev +} + +define i3 @bitreverse_i3(i3 %x) { +; RV32-LABEL: bitreverse_i3: +; RV32: # %bb.0: +; RV32-NEXT: slli a1, a0, 2 +; RV32-NEXT: andi a1, a1, 4 +; RV32-NEXT: andi a0, a0, 7 +; RV32-NEXT: andi a2, a0, 2 +; RV32-NEXT: or a1, a1, a2 +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: bitreverse_i3: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a0, 2 +; RV64-NEXT: andi a1, a1, 4 +; RV64-NEXT: andi a0, a0, 7 +; RV64-NEXT: andi a2, a0, 2 +; RV64-NEXT: or a1, a1, a2 +; RV64-NEXT: srliw a0, a0, 2 +; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: ret + %rev = call i3 @llvm.bitreverse.i3(i3 %x) + ret i3 %rev +} + +define i4 @bitreverse_i4(i4 %x) { +; RV32-LABEL: bitreverse_i4: +; RV32: # %bb.0: +; RV32-NEXT: slli a1, a0, 3 +; RV32-NEXT: andi a1, a1, 8 +; RV32-NEXT: slli a2, a0, 1 +; RV32-NEXT: andi a2, a2, 4 +; RV32-NEXT: or a1, a1, a2 +; RV32-NEXT: andi a0, a0, 15 +; RV32-NEXT: srli a2, a0, 1 +; RV32-NEXT: andi a2, a2, 2 +; RV32-NEXT: or a1, a1, a2 +; RV32-NEXT: srli a0, a0, 3 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: bitreverse_i4: 
+; RV64: # %bb.0: +; RV64-NEXT: slli a1, a0, 3 +; RV64-NEXT: andi a1, a1, 8 +; RV64-NEXT: slli a2, a0, 1 +; RV64-NEXT: andi a2, a2, 4 +; RV64-NEXT: or a1, a1, a2 +; RV64-NEXT: andi a0, a0, 15 +; RV64-NEXT: srliw a2, a0, 1 +; RV64-NEXT: andi a2, a2, 2 +; RV64-NEXT: or a1, a1, a2 +; RV64-NEXT: srliw a0, a0, 3 +; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: ret + %rev = call i4 @llvm.bitreverse.i4(i4 %x) + ret i4 %rev +} + +define i7 @bitreverse_i7(i7 %x) { +; RV32-LABEL: bitreverse_i7: +; RV32: # %bb.0: +; RV32-NEXT: slli a1, a0, 6 +; RV32-NEXT: andi a1, a1, 64 +; RV32-NEXT: slli a2, a0, 4 +; RV32-NEXT: andi a2, a2, 32 +; RV32-NEXT: or a1, a1, a2 +; RV32-NEXT: slli a2, a0, 2 +; RV32-NEXT: andi a2, a2, 16 +; RV32-NEXT: andi a0, a0, 127 +; RV32-NEXT: andi a3, a0, 8 +; RV32-NEXT: or a2, a2, a3 +; RV32-NEXT: or a1, a1, a2 +; RV32-NEXT: srli a2, a0, 2 +; RV32-NEXT: andi a2, a2, 4 +; RV32-NEXT: srli a3, a0, 4 +; RV32-NEXT: andi a3, a3, 2 +; RV32-NEXT: or a2, a2, a3 +; RV32-NEXT: or a1, a1, a2 +; RV32-NEXT: srli a0, a0, 6 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: bitreverse_i7: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a0, 6 +; RV64-NEXT: andi a1, a1, 64 +; RV64-NEXT: slli a2, a0, 4 +; RV64-NEXT: andi a2, a2, 32 +; RV64-NEXT: or a1, a1, a2 +; RV64-NEXT: slli a2, a0, 2 +; RV64-NEXT: andi a2, a2, 16 +; RV64-NEXT: andi a0, a0, 127 +; RV64-NEXT: andi a3, a0, 8 +; RV64-NEXT: or a2, a2, a3 +; RV64-NEXT: or a1, a1, a2 +; RV64-NEXT: srliw a2, a0, 2 +; RV64-NEXT: andi a2, a2, 4 +; RV64-NEXT: srliw a3, a0, 4 +; RV64-NEXT: andi a3, a3, 2 +; RV64-NEXT: or a2, a2, a3 +; RV64-NEXT: or a1, a1, a2 +; RV64-NEXT: srliw a0, a0, 6 +; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: ret + %rev = call i7 @llvm.bitreverse.i7(i7 %x) + ret i7 %rev +} + +define i24 @bitreverse_i24(i24 %x) { +; RV32-LABEL: bitreverse_i24: +; RV32: # %bb.0: +; RV32-NEXT: slli a1, a0, 16 +; RV32-NEXT: lui a2, 4096 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: srli a0, a0, 16 +; 
RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: lui a1, 1048335 +; RV32-NEXT: addi a1, a1, 240 +; RV32-NEXT: and a3, a1, a2 +; RV32-NEXT: and a3, a0, a3 +; RV32-NEXT: srli a3, a3, 4 +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a3, a0 +; RV32-NEXT: lui a1, 1047757 +; RV32-NEXT: addi a1, a1, -820 +; RV32-NEXT: and a3, a1, a2 +; RV32-NEXT: and a3, a0, a3 +; RV32-NEXT: srli a3, a3, 2 +; RV32-NEXT: slli a0, a0, 2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a3, a0 +; RV32-NEXT: lui a1, 1047211 +; RV32-NEXT: addi a1, a1, -1366 +; RV32-NEXT: and a2, a1, a2 +; RV32-NEXT: and a2, a0, a2 +; RV32-NEXT: srli a2, a2, 1 +; RV32-NEXT: slli a0, a0, 1 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: bitreverse_i24: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a0, 16 +; RV64-NEXT: lui a2, 4096 +; RV64-NEXT: addi a2, a2, -1 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: srliw a0, a0, 16 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: lui a1, 1048335 +; RV64-NEXT: addi a1, a1, 240 +; RV64-NEXT: and a3, a1, a2 +; RV64-NEXT: and a3, a0, a3 +; RV64-NEXT: srliw a3, a3, 4 +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a3, a0 +; RV64-NEXT: lui a1, 1047757 +; RV64-NEXT: addi a1, a1, -820 +; RV64-NEXT: and a3, a1, a2 +; RV64-NEXT: and a3, a0, a3 +; RV64-NEXT: srliw a3, a3, 2 +; RV64-NEXT: slli a0, a0, 2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a3, a0 +; RV64-NEXT: lui a1, 1047211 +; RV64-NEXT: addiw a1, a1, -1366 +; RV64-NEXT: and a2, a1, a2 +; RV64-NEXT: and a2, a0, a2 +; RV64-NEXT: srliw a2, a2, 1 +; RV64-NEXT: slliw a0, a0, 1 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a2, a0 +; RV64-NEXT: ret + %rev = call i24 @llvm.bitreverse.i24(i24 %x) + ret i24 %rev +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv32.mir index 5044514babe54a..7625a5c2d568a9 100644 --- 
a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv32.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 -# RUN: llc -mtriple=riscv32 -run-pass=legalizer %s -o - | FileCheck %s +# RUN: llc -mtriple=riscv32 -mattr=+v -global-isel-abort=0 -run-pass=legalizer %s -o - | FileCheck %s --- name: bitreverse_i8 @@ -248,3 +248,277 @@ body: | PseudoRET implicit $x10, implicit $x11 ... +--- +name: bitreverse_i2 +body: | + bb.1.entry: + liveins: $x10 + + ; CHECK-LABEL: name: bitreverse_i2 + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SHL]], [[C1]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C3]] + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[C2]](s32) + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C4]] + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND2]] + ; CHECK-NEXT: $x10 = COPY [[OR]](s32) + ; CHECK-NEXT: PseudoRET implicit $x10 + %1:_(s32) = COPY $x10 + %0:_(s2) = G_TRUNC %1(s32) + %2:_(s2) = G_BITREVERSE %0 + %3:_(s32) = G_ANYEXT %2(s2) + $x10 = COPY %3(s32) + PseudoRET implicit $x10 + +... 
+--- +name: bitreverse_i3 +body: | + bb.1.entry: + liveins: $x10 + + ; CHECK-LABEL: name: bitreverse_i3 + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SHL]], [[C1]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C3]] + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[C2]](s32) + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C4]] + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND2]] + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C6]] + ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND3]], [[C5]](s32) + ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C7]] + ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[AND4]] + ; CHECK-NEXT: $x10 = COPY [[OR1]](s32) + ; CHECK-NEXT: PseudoRET implicit $x10 + %1:_(s32) = COPY $x10 + %0:_(s3) = G_TRUNC %1(s32) + %2:_(s3) = G_BITREVERSE %0 + %3:_(s32) = G_ANYEXT %2(s3) + $x10 = COPY %3(s32) + PseudoRET implicit $x10 + +... 
+--- +name: bitreverse_i4 +body: | + bb.1.entry: + liveins: $x10 + + ; CHECK-LABEL: name: bitreverse_i4 + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SHL]], [[C1]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C2]](s32) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[SHL1]], [[C3]] + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND1]] + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 15 + ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C5]] + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C4]](s32) + ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C6]] + ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[AND3]] + ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 15 + ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C8]] + ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND4]], [[C7]](s32) + ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C9]] + ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[AND5]] + ; CHECK-NEXT: $x10 = COPY [[OR2]](s32) + ; CHECK-NEXT: PseudoRET implicit $x10 + %1:_(s32) = COPY $x10 + %0:_(s4) = G_TRUNC %1(s32) + %2:_(s4) = G_BITREVERSE %0 + %3:_(s32) = G_ANYEXT %2(s4) + $x10 = COPY %3(s32) + PseudoRET implicit $x10 + +... 
+--- +name: bitreverse_i7 +body: | + bb.1.entry: + liveins: $x10 + + ; CHECK-LABEL: name: bitreverse_i7 + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 64 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SHL]], [[C1]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C2]](s32) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[SHL1]], [[C3]] + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND1]] + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C4]](s32) + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[SHL2]], [[C5]] + ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[AND2]] + ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 127 + ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C7]] + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND3]], [[C6]](s32) + ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C8]] + ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[AND4]] + ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 127 + ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C10]] + ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND5]], [[C9]](s32) + ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C11]] + ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR2]], [[AND6]] + ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 4 + ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 127 + ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C13]] + ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[AND7]], [[C12]](s32) + ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[LSHR2]], [[C14]] + ; CHECK-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[AND8]] + ; CHECK-NEXT: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; CHECK-NEXT: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 127 + ; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C16]] + ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND9]], [[C15]](s32) + ; CHECK-NEXT: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND10:%[0-9]+]]:_(s32) = G_AND [[LSHR3]], [[C17]] + ; CHECK-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[AND10]] + ; CHECK-NEXT: $x10 = COPY [[OR5]](s32) + ; CHECK-NEXT: PseudoRET implicit $x10 + %1:_(s32) = COPY $x10 + %0:_(s7) = G_TRUNC %1(s32) + %2:_(s7) = G_BITREVERSE %0 + %3:_(s32) = G_ANYEXT %2(s7) + $x10 = COPY %3(s32) + PseudoRET implicit $x10 + +... 
+--- +name: bitreverse_i24 +body: | + bb.1.entry: + liveins: $x10 + + ; CHECK-LABEL: name: bitreverse_i24 + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]] + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[LSHR]], [[SHL]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -986896 + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[OR]], [[C3]] + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215 + ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[AND1]], [[C4]] + ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[OR]], [[C2]](s32) + ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[SHL1]], [[C3]] + ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[LSHR1]], [[AND3]] + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 -3355444 + ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[OR1]], [[C6]] + ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215 + ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[AND4]], [[C7]] + ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[AND5]], [[C5]](s32) + ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C5]](s32) + ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[SHL2]], [[C6]] + ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[LSHR2]], [[AND6]] + ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 -5592406 + ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[OR2]], [[C9]] + ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT 
i32 16777215 + ; CHECK-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[AND7]], [[C10]] + ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND8]], [[C8]](s32) + ; CHECK-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[OR2]], [[C8]](s32) + ; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[SHL3]], [[C9]] + ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[LSHR3]], [[AND9]] + ; CHECK-NEXT: $x10 = COPY [[OR3]](s32) + ; CHECK-NEXT: PseudoRET implicit $x10 + %1:_(s32) = COPY $x10 + %0:_(s24) = G_TRUNC %1(s32) + %2:_(s24) = G_BITREVERSE %0 + %3:_(s32) = G_ANYEXT %2(s24) + $x10 = COPY %3(s32) + PseudoRET implicit $x10 + +... +--- +name: bitreverse_v2i4 +body: | + bb.1.entry: + + ; CHECK-LABEL: name: bitreverse_v2i4 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $v8 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s4>) = G_TRUNC [[COPY]](<2 x s32>) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s4) = G_CONSTANT i4 3 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C]](s4), [[C]](s4) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(<2 x s4>) = G_SHL [[TRUNC]], [[BUILD_VECTOR]](<2 x s4>) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s4) = G_CONSTANT i4 -8 + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C1]](s4), [[C1]](s4) + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<2 x s4>) = G_AND [[SHL]], [[BUILD_VECTOR1]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s4) = G_CONSTANT i4 1 + ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C2]](s4), [[C2]](s4) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(<2 x s4>) = G_SHL [[TRUNC]], [[BUILD_VECTOR2]](<2 x s4>) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s4) = G_CONSTANT i4 4 + ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C3]](s4), [[C3]](s4) + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(<2 x s4>) = G_AND [[SHL1]], [[BUILD_VECTOR3]] + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<2 x s4>) = G_OR [[AND]], [[AND1]] + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s4) = G_CONSTANT i4 1 + ; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C4]](s4), [[C4]](s4) + ; CHECK-NEXT: 
[[LSHR:%[0-9]+]]:_(<2 x s4>) = G_LSHR [[TRUNC]], [[BUILD_VECTOR4]](<2 x s4>) + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s4) = G_CONSTANT i4 2 + ; CHECK-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C5]](s4), [[C5]](s4) + ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(<2 x s4>) = G_AND [[LSHR]], [[BUILD_VECTOR5]] + ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(<2 x s4>) = G_OR [[OR]], [[AND2]] + ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s4) = G_CONSTANT i4 3 + ; CHECK-NEXT: [[BUILD_VECTOR6:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C6]](s4), [[C6]](s4) + ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<2 x s4>) = G_LSHR [[TRUNC]], [[BUILD_VECTOR6]](<2 x s4>) + ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s4) = G_CONSTANT i4 1 + ; CHECK-NEXT: [[BUILD_VECTOR7:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C7]](s4), [[C7]](s4) + ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(<2 x s4>) = G_AND [[LSHR1]], [[BUILD_VECTOR7]] + ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(<2 x s4>) = G_OR [[OR1]], [[AND3]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s4>) = COPY [[OR2]](<2 x s4>) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<2 x s32>) = G_ANYEXT [[COPY1]](<2 x s4>) + ; CHECK-NEXT: $v8 = COPY [[ANYEXT]](<2 x s32>) + ; CHECK-NEXT: PseudoRET implicit $v8 + %1:_(<2 x s32>) = COPY $v8 + %0:_(<2 x s4>) = G_TRUNC %1(<2 x s32>) + %2:_(<2 x s4>) = G_BITREVERSE %0 + %3:_(<2 x s32>) = G_ANYEXT %2(<2 x s4>) + $v8 = COPY %3(<2 x s32>) + PseudoRET implicit $v8 + +... 
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv64.mir index d1473504651668..71583f15cd5cd1 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv64.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 -# RUN: llc -mtriple=riscv64 -run-pass=legalizer %s -o - | FileCheck %s +# RUN: llc -mtriple=riscv64 -mattr=+v -global-isel-abort=0 -global-isel-abort=0 -run-pass=legalizer %s -o - | FileCheck %s --- name: bitreverse_i8 @@ -251,3 +251,304 @@ body: | PseudoRET implicit $x10 ... +--- +name: bitreverse_i2 +body: | + bb.1.entry: + liveins: $x10 + + ; CHECK-LABEL: name: bitreverse_i2 + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SHL]], [[C1]] + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C2]] + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[C3]](s64) + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C4]] + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND2]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR]](s32) + ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) + ; CHECK-NEXT: PseudoRET implicit $x10 + %1:_(s64) = COPY $x10 + %0:_(s2) = G_TRUNC %1(s64) + %2:_(s2) = 
G_BITREVERSE %0 + %3:_(s64) = G_ANYEXT %2(s2) + $x10 = COPY %3(s64) + PseudoRET implicit $x10 + +... +--- +name: bitreverse_i3 +body: | + bb.1.entry: + liveins: $x10 + + ; CHECK-LABEL: name: bitreverse_i3 + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SHL]], [[C1]] + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C2]] + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[C3]](s64) + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C4]] + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND2]] + ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[TRUNC2]], [[C5]] + ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND3]], [[C6]](s64) + ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C7]] + ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[AND4]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) + ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) + ; CHECK-NEXT: PseudoRET implicit $x10 + %1:_(s64) = COPY $x10 + %0:_(s3) = G_TRUNC %1(s64) + %2:_(s3) = G_BITREVERSE %0 + %3:_(s64) = G_ANYEXT %2(s3) + $x10 = COPY %3(s64) + PseudoRET implicit $x10 + +... 
+--- +name: bitreverse_i4 +body: | + bb.1.entry: + liveins: $x10 + + ; CHECK-LABEL: name: bitreverse_i4 + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SHL]], [[C1]] + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[TRUNC1]], [[C2]](s64) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[SHL1]], [[C3]] + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND1]] + ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 15 + ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[TRUNC2]], [[C4]] + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C5]](s64) + ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C6]] + ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[AND3]] + ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 15 + ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[TRUNC3]], [[C7]] + ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 + ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND4]], [[C8]](s64) + ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C9]] + ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[AND5]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR2]](s32) + ; CHECK-NEXT: $x10 = COPY 
[[ANYEXT]](s64) + ; CHECK-NEXT: PseudoRET implicit $x10 + %1:_(s64) = COPY $x10 + %0:_(s4) = G_TRUNC %1(s64) + %2:_(s4) = G_BITREVERSE %0 + %3:_(s64) = G_ANYEXT %2(s4) + $x10 = COPY %3(s64) + PseudoRET implicit $x10 + +... +--- +name: bitreverse_i7 +body: | + bb.1.entry: + liveins: $x10 + + ; CHECK-LABEL: name: bitreverse_i7 + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 64 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SHL]], [[C1]] + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[TRUNC1]], [[C2]](s64) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[SHL1]], [[C3]] + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND1]] + ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[TRUNC2]], [[C4]](s64) + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[SHL2]], [[C5]] + ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[AND2]] + ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 127 + ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[TRUNC3]], [[C6]] + ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND3]], [[C7]](s64) + ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C8]] + ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], 
[[AND4]] + ; CHECK-NEXT: [[TRUNC4:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 127 + ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[TRUNC4]], [[C9]] + ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND5]], [[C10]](s64) + ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C11]] + ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR2]], [[AND6]] + ; CHECK-NEXT: [[TRUNC5:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 127 + ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[TRUNC5]], [[C12]] + ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[AND7]], [[C13]](s64) + ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[LSHR2]], [[C14]] + ; CHECK-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[AND8]] + ; CHECK-NEXT: [[TRUNC6:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 127 + ; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[TRUNC6]], [[C15]] + ; CHECK-NEXT: [[C16:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 + ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND9]], [[C16]](s64) + ; CHECK-NEXT: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND10:%[0-9]+]]:_(s32) = G_AND [[LSHR3]], [[C17]] + ; CHECK-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[AND10]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) + ; CHECK-NEXT: PseudoRET implicit $x10 + %1:_(s64) = COPY $x10 + %0:_(s7) = G_TRUNC %1(s64) + %2:_(s7) = G_BITREVERSE %0 + %3:_(s64) = G_ANYEXT %2(s7) + $x10 = COPY %3(s64) + PseudoRET implicit $x10 + +... 
+--- +name: bitreverse_i24 +body: | + bb.1.entry: + liveins: $x10 + + ; CHECK-LABEL: name: bitreverse_i24 + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C1]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C2]](s64) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[LSHR]], [[SHL]] + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -986896 + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[OR]], [[C3]] + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215 + ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[AND1]], [[C4]] + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C5]](s64) + ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[OR]], [[C6]](s64) + ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[SHL1]], [[C3]] + ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[LSHR1]], [[AND3]] + ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 -3355444 + ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[OR1]], [[C7]] + ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215 + ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[AND4]], [[C8]] + ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[AND5]], [[C9]](s64) + ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C10]](s64) + ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[SHL2]], 
[[C7]] + ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[LSHR2]], [[AND6]] + ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 -5592406 + ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[OR2]], [[C11]] + ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215 + ; CHECK-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[AND7]], [[C12]] + ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND8]], [[C13]](s64) + ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[OR2]], [[C14]](s64) + ; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[SHL3]], [[C11]] + ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[LSHR3]], [[AND9]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR3]](s32) + ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) + ; CHECK-NEXT: PseudoRET implicit $x10 + %1:_(s64) = COPY $x10 + %0:_(s24) = G_TRUNC %1(s64) + %2:_(s24) = G_BITREVERSE %0 + %3:_(s64) = G_ANYEXT %2(s24) + $x10 = COPY %3(s64) + PseudoRET implicit $x10 + +... 
+--- +name: bitreverse_v2i4 +body: | + bb.1.entry: + + ; CHECK-LABEL: name: bitreverse_v2i4 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $v8 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s4>) = G_TRUNC [[COPY]](<2 x s32>) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s4) = G_CONSTANT i4 3 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C]](s4), [[C]](s4) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(<2 x s4>) = G_SHL [[TRUNC]], [[BUILD_VECTOR]](<2 x s4>) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s4) = G_CONSTANT i4 -8 + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C1]](s4), [[C1]](s4) + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<2 x s4>) = G_AND [[SHL]], [[BUILD_VECTOR1]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s4) = G_CONSTANT i4 1 + ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C2]](s4), [[C2]](s4) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(<2 x s4>) = G_SHL [[TRUNC]], [[BUILD_VECTOR2]](<2 x s4>) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s4) = G_CONSTANT i4 4 + ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C3]](s4), [[C3]](s4) + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(<2 x s4>) = G_AND [[SHL1]], [[BUILD_VECTOR3]] + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<2 x s4>) = G_OR [[AND]], [[AND1]] + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s4) = G_CONSTANT i4 1 + ; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C4]](s4), [[C4]](s4) + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<2 x s4>) = G_LSHR [[TRUNC]], [[BUILD_VECTOR4]](<2 x s4>) + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s4) = G_CONSTANT i4 2 + ; CHECK-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C5]](s4), [[C5]](s4) + ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(<2 x s4>) = G_AND [[LSHR]], [[BUILD_VECTOR5]] + ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(<2 x s4>) = G_OR [[OR]], [[AND2]] + ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s4) = G_CONSTANT i4 3 + ; CHECK-NEXT: [[BUILD_VECTOR6:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C6]](s4), [[C6]](s4) + ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<2 x s4>) = G_LSHR [[TRUNC]], [[BUILD_VECTOR6]](<2 
x s4>) + ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s4) = G_CONSTANT i4 1 + ; CHECK-NEXT: [[BUILD_VECTOR7:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C7]](s4), [[C7]](s4) + ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(<2 x s4>) = G_AND [[LSHR1]], [[BUILD_VECTOR7]] + ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(<2 x s4>) = G_OR [[OR1]], [[AND3]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s4>) = COPY [[OR2]](<2 x s4>) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<2 x s32>) = G_ANYEXT [[COPY1]](<2 x s4>) + ; CHECK-NEXT: $v8 = COPY [[ANYEXT]](<2 x s32>) + ; CHECK-NEXT: PseudoRET implicit $v8 + %1:_(<2 x s32>) = COPY $v8 + %0:_(<2 x s4>) = G_TRUNC %1(<2 x s32>) + %2:_(<2 x s4>) = G_BITREVERSE %0 + %3:_(<2 x s32>) = G_ANYEXT %2(<2 x s4>) + $v8 = COPY %3(<2 x s32>) + PseudoRET implicit $v8 + +... From fba84ecc158ec4a9e0eae91d923d4a8f15e7ed6f Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 29 May 2024 15:42:09 +0200 Subject: [PATCH 126/230] [WPD] Directly create geteleementptr inbounds (NFCI) We know that this GEP is inbounds, so make it explicit. NFCI because constant expression construction already infers this. --- llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index e7a188e9431db5..9929ebb96dcafe 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -1927,7 +1927,7 @@ void DevirtModule::rebuildGlobal(VTableBits &B) { // element (the original initializer). 
auto Alias = GlobalAlias::create( B.GV->getInitializer()->getType(), 0, B.GV->getLinkage(), "", - ConstantExpr::getGetElementPtr( + ConstantExpr::getInBoundsGetElementPtr( NewInit->getType(), NewGV, ArrayRef{ConstantInt::get(Int32Ty, 0), ConstantInt::get(Int32Ty, 1)}), From 886d31675dbb6fe8cf97fd9083870bd043ce9f02 Mon Sep 17 00:00:00 2001 From: Tuan Chuong Goh Date: Wed, 29 May 2024 13:18:01 +0000 Subject: [PATCH 127/230] [AArch64][NFC] Pre-commit test for Push ADD/SUB through {S|Z}EXT (#90964) --- .../AArch64/GlobalISel/combine-add.mir | 119 ++ llvm/test/CodeGen/AArch64/neon-extadd.ll | 1785 +++++++++++++---- 2 files changed, 1514 insertions(+), 390 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-add.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-add.mir index fad3655da9d013..78411f34bebd31 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-add.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-add.mir @@ -207,3 +207,122 @@ body: | %3:_(<4 x s32>) = G_FADD %0, %2(<4 x s32>) $q0 = COPY %3(<4 x s32>) ... 
+--- +name: saddl_v8i8_v8i32 +tracksRegLiveness: true +body: | + bb.1: + liveins: $d0, $d1 + + ; CHECK-LABEL: name: saddl_v8i8_v8i32 + ; CHECK: liveins: $d0, $d1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s8>) = COPY $d1 + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[COPY]](<8 x s8>) + ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[COPY1]](<8 x s8>) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s32>) = G_ADD [[SEXT]], [[SEXT1]] + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[ADD]](<8 x s32>) + ; CHECK-NEXT: $q0 = COPY [[UV]](<4 x s32>) + ; CHECK-NEXT: $q1 = COPY [[UV1]](<4 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1 + %0:_(<8 x s8>) = COPY $d0 + %1:_(<8 x s8>) = COPY $d1 + %2:_(<8 x s32>) = G_SEXT %0(<8 x s8>) + %3:_(<8 x s32>) = G_SEXT %1(<8 x s8>) + %4:_(<8 x s32>) = G_ADD %2, %3 + %5:_(<4 x s32>), %6:_(<4 x s32>) = G_UNMERGE_VALUES %4(<8 x s32>) + $q0 = COPY %5(<4 x s32>) + $q1 = COPY %6(<4 x s32>) + RET_ReallyLR implicit $q0, implicit $q1 +... 
+ +--- +name: uaddl_v8i8_v8i32 +tracksRegLiveness: true +body: | + bb.1: + liveins: $d0, $d1 + + ; CHECK-LABEL: name: uaddl_v8i8_v8i32 + ; CHECK: liveins: $d0, $d1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s8>) = COPY $d1 + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<8 x s32>) = G_ZEXT [[COPY]](<8 x s8>) + ; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(<8 x s32>) = G_ZEXT [[COPY1]](<8 x s8>) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s32>) = G_ADD [[ZEXT]], [[ZEXT1]] + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[ADD]](<8 x s32>) + ; CHECK-NEXT: $q0 = COPY [[UV]](<4 x s32>) + ; CHECK-NEXT: $q1 = COPY [[UV1]](<4 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1 + %0:_(<8 x s8>) = COPY $d0 + %1:_(<8 x s8>) = COPY $d1 + %2:_(<8 x s32>) = G_ZEXT %0(<8 x s8>) + %3:_(<8 x s32>) = G_ZEXT %1(<8 x s8>) + %4:_(<8 x s32>) = G_ADD %2, %3 + %5:_(<4 x s32>), %6:_(<4 x s32>) = G_UNMERGE_VALUES %4(<8 x s32>) + $q0 = COPY %5(<4 x s32>) + $q1 = COPY %6(<4 x s32>) + RET_ReallyLR implicit $q0, implicit $q1 +... 
+ +--- +name: ssubl_v8i8_v8i32 +tracksRegLiveness: true +body: | + bb.1: + liveins: $d0, $d1 + + ; CHECK-LABEL: name: ssubl_v8i8_v8i32 + ; CHECK: liveins: $d0, $d1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s8>) = COPY $d1 + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[COPY]](<8 x s8>) + ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[COPY1]](<8 x s8>) + ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s32>) = G_SUB [[SEXT]], [[SEXT1]] + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[SUB]](<8 x s32>) + ; CHECK-NEXT: $q0 = COPY [[UV]](<4 x s32>) + ; CHECK-NEXT: $q1 = COPY [[UV1]](<4 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1 + %0:_(<8 x s8>) = COPY $d0 + %1:_(<8 x s8>) = COPY $d1 + %2:_(<8 x s32>) = G_SEXT %0(<8 x s8>) + %3:_(<8 x s32>) = G_SEXT %1(<8 x s8>) + %4:_(<8 x s32>) = G_SUB %2, %3 + %5:_(<4 x s32>), %6:_(<4 x s32>) = G_UNMERGE_VALUES %4(<8 x s32>) + $q0 = COPY %5(<4 x s32>) + $q1 = COPY %6(<4 x s32>) + RET_ReallyLR implicit $q0, implicit $q1 +... 
+ +--- +name: usubl_v8i8_v8i32 +tracksRegLiveness: true +body: | + bb.1: + liveins: $d0, $d1 + + ; CHECK-LABEL: name: usubl_v8i8_v8i32 + ; CHECK: liveins: $d0, $d1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s8>) = COPY $d1 + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<8 x s32>) = G_ZEXT [[COPY]](<8 x s8>) + ; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(<8 x s32>) = G_ZEXT [[COPY1]](<8 x s8>) + ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s32>) = G_SUB [[ZEXT]], [[ZEXT1]] + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[SUB]](<8 x s32>) + ; CHECK-NEXT: $q0 = COPY [[UV]](<4 x s32>) + ; CHECK-NEXT: $q1 = COPY [[UV1]](<4 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1 + %0:_(<8 x s8>) = COPY $d0 + %1:_(<8 x s8>) = COPY $d1 + %2:_(<8 x s32>) = G_ZEXT %0(<8 x s8>) + %3:_(<8 x s32>) = G_ZEXT %1(<8 x s8>) + %4:_(<8 x s32>) = G_SUB %2, %3 + %5:_(<4 x s32>), %6:_(<4 x s32>) = G_UNMERGE_VALUES %4(<8 x s32>) + $q0 = COPY %5(<4 x s32>) + $q1 = COPY %6(<4 x s32>) + RET_ReallyLR implicit $q0, implicit $q1 +... 
diff --git a/llvm/test/CodeGen/AArch64/neon-extadd.ll b/llvm/test/CodeGen/AArch64/neon-extadd.ll index 16200435c5c31d..6aa9c394a8fd1f 100644 --- a/llvm/test/CodeGen/AArch64/neon-extadd.ll +++ b/llvm/test/CodeGen/AArch64/neon-extadd.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple aarch64 -o - | FileCheck %s +; RUN: llc < %s -mtriple aarch64 -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -mtriple aarch64 -o - -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI define <8 x i16> @extadds_v8i8_i16(<8 x i8> %s0, <8 x i8> %s1) { ; CHECK-LABEL: extadds_v8i8_i16: @@ -26,12 +27,19 @@ entry: } define <16 x i16> @extadds_v16i8_i16(<16 x i8> %s0, <16 x i8> %s1) { -; CHECK-LABEL: extadds_v16i8_i16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: saddl2 v2.8h, v0.16b, v1.16b -; CHECK-NEXT: saddl v0.8h, v0.8b, v1.8b -; CHECK-NEXT: mov v1.16b, v2.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extadds_v16i8_i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: saddl2 v2.8h, v0.16b, v1.16b +; CHECK-SD-NEXT: saddl v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: mov v1.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extadds_v16i8_i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: saddl v2.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: saddl2 v1.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: mov v0.16b, v2.16b +; CHECK-GI-NEXT: ret entry: %s0s = sext <16 x i8> %s0 to <16 x i16> %s1s = sext <16 x i8> %s1 to <16 x i16> @@ -40,12 +48,19 @@ entry: } define <16 x i16> @extaddu_v16i8_i16(<16 x i8> %s0, <16 x i8> %s1) { -; CHECK-LABEL: extaddu_v16i8_i16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uaddl2 v2.8h, v0.16b, v1.16b -; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b -; CHECK-NEXT: mov v1.16b, v2.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extaddu_v16i8_i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: uaddl2 v2.8h, v0.16b, v1.16b +; CHECK-SD-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: mov v1.16b, v2.16b 
+; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extaddu_v16i8_i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: uaddl v2.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: uaddl2 v1.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: mov v0.16b, v2.16b +; CHECK-GI-NEXT: ret entry: %s0s = zext <16 x i8> %s0 to <16 x i16> %s1s = zext <16 x i8> %s1 to <16 x i16> @@ -54,16 +69,26 @@ entry: } define <32 x i16> @extadds_v32i8_i16(<32 x i8> %s0, <32 x i8> %s1) { -; CHECK-LABEL: extadds_v32i8_i16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: saddl2 v4.8h, v1.16b, v3.16b -; CHECK-NEXT: saddl v5.8h, v0.8b, v2.8b -; CHECK-NEXT: saddl2 v6.8h, v0.16b, v2.16b -; CHECK-NEXT: saddl v2.8h, v1.8b, v3.8b -; CHECK-NEXT: mov v0.16b, v5.16b -; CHECK-NEXT: mov v1.16b, v6.16b -; CHECK-NEXT: mov v3.16b, v4.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extadds_v32i8_i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: saddl2 v4.8h, v1.16b, v3.16b +; CHECK-SD-NEXT: saddl v5.8h, v0.8b, v2.8b +; CHECK-SD-NEXT: saddl2 v6.8h, v0.16b, v2.16b +; CHECK-SD-NEXT: saddl v2.8h, v1.8b, v3.8b +; CHECK-SD-NEXT: mov v0.16b, v5.16b +; CHECK-SD-NEXT: mov v1.16b, v6.16b +; CHECK-SD-NEXT: mov v3.16b, v4.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extadds_v32i8_i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: saddl v4.8h, v0.8b, v2.8b +; CHECK-GI-NEXT: saddl2 v5.8h, v0.16b, v2.16b +; CHECK-GI-NEXT: saddl v2.8h, v1.8b, v3.8b +; CHECK-GI-NEXT: saddl2 v3.8h, v1.16b, v3.16b +; CHECK-GI-NEXT: mov v0.16b, v4.16b +; CHECK-GI-NEXT: mov v1.16b, v5.16b +; CHECK-GI-NEXT: ret entry: %s0s = sext <32 x i8> %s0 to <32 x i16> %s1s = sext <32 x i8> %s1 to <32 x i16> @@ -72,16 +97,26 @@ entry: } define <32 x i16> @extaddu_v32i8_i16(<32 x i8> %s0, <32 x i8> %s1) { -; CHECK-LABEL: extaddu_v32i8_i16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uaddl2 v4.8h, v1.16b, v3.16b -; CHECK-NEXT: uaddl v5.8h, v0.8b, v2.8b -; CHECK-NEXT: uaddl2 v6.8h, v0.16b, v2.16b -; CHECK-NEXT: uaddl v2.8h, v1.8b, v3.8b -; CHECK-NEXT: mov v0.16b, v5.16b -; CHECK-NEXT: mov 
v1.16b, v6.16b -; CHECK-NEXT: mov v3.16b, v4.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extaddu_v32i8_i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: uaddl2 v4.8h, v1.16b, v3.16b +; CHECK-SD-NEXT: uaddl v5.8h, v0.8b, v2.8b +; CHECK-SD-NEXT: uaddl2 v6.8h, v0.16b, v2.16b +; CHECK-SD-NEXT: uaddl v2.8h, v1.8b, v3.8b +; CHECK-SD-NEXT: mov v0.16b, v5.16b +; CHECK-SD-NEXT: mov v1.16b, v6.16b +; CHECK-SD-NEXT: mov v3.16b, v4.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extaddu_v32i8_i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: uaddl v4.8h, v0.8b, v2.8b +; CHECK-GI-NEXT: uaddl2 v5.8h, v0.16b, v2.16b +; CHECK-GI-NEXT: uaddl v2.8h, v1.8b, v3.8b +; CHECK-GI-NEXT: uaddl2 v3.8h, v1.16b, v3.16b +; CHECK-GI-NEXT: mov v0.16b, v4.16b +; CHECK-GI-NEXT: mov v1.16b, v5.16b +; CHECK-GI-NEXT: ret entry: %s0s = zext <32 x i8> %s0 to <32 x i16> %s1s = zext <32 x i8> %s1 to <32 x i16> @@ -90,12 +125,20 @@ entry: } define <8 x i32> @extadds_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1) { -; CHECK-LABEL: extadds_v8i8_i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: saddl v0.8h, v0.8b, v1.8b -; CHECK-NEXT: sshll2 v1.4s, v0.8h, #0 -; CHECK-NEXT: sshll v0.4s, v0.4h, #0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extadds_v8i8_i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: saddl v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: sshll2 v1.4s, v0.8h, #0 +; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extadds_v8i8_i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0 +; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: saddl v0.4s, v2.4h, v1.4h +; CHECK-GI-NEXT: saddl2 v1.4s, v2.8h, v1.8h +; CHECK-GI-NEXT: ret entry: %s0s = sext <8 x i8> %s0 to <8 x i32> %s1s = sext <8 x i8> %s1 to <8 x i32> @@ -104,12 +147,20 @@ entry: } define <8 x i32> @extaddu_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1) { -; CHECK-LABEL: extaddu_v8i8_i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b -; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 -; 
CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extaddu_v8i8_i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extaddu_v8i8_i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ushll v2.8h, v0.8b, #0 +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: uaddl v0.4s, v2.4h, v1.4h +; CHECK-GI-NEXT: uaddl2 v1.4s, v2.8h, v1.8h +; CHECK-GI-NEXT: ret entry: %s0s = zext <8 x i8> %s0 to <8 x i32> %s1s = zext <8 x i8> %s1 to <8 x i32> @@ -117,16 +168,72 @@ entry: ret <8 x i32> %m } +define <8 x i32> @extsubs_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1) { +; CHECK-SD-LABEL: extsubs_v8i8_i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ssubl v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: sshll2 v1.4s, v0.8h, #0 +; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extsubs_v8i8_i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0 +; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: ssubl v0.4s, v2.4h, v1.4h +; CHECK-GI-NEXT: ssubl2 v1.4s, v2.8h, v1.8h +; CHECK-GI-NEXT: ret +entry: + %s0s = sext <8 x i8> %s0 to <8 x i32> + %s1s = sext <8 x i8> %s1 to <8 x i32> + %m = sub <8 x i32> %s0s, %s1s + ret <8 x i32> %m +} + +define <8 x i32> @extsubu_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1) { +; CHECK-SD-LABEL: extsubu_v8i8_i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: usubl v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: sshll2 v1.4s, v0.8h, #0 +; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extsubu_v8i8_i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ushll v2.8h, v0.8b, #0 +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: usubl v0.4s, v2.4h, v1.4h +; CHECK-GI-NEXT: usubl2 v1.4s, v2.8h, v1.8h +; CHECK-GI-NEXT: ret +entry: + %s0s = zext <8 x i8> %s0 to <8 x i32> + %s1s = zext <8 x i8> %s1 to <8 x 
i32> + %m = sub <8 x i32> %s0s, %s1s + ret <8 x i32> %m +} + define <16 x i32> @extadds_v16i8_i32(<16 x i8> %s0, <16 x i8> %s1) { -; CHECK-LABEL: extadds_v16i8_i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: saddl v2.8h, v0.8b, v1.8b -; CHECK-NEXT: saddl2 v4.8h, v0.16b, v1.16b -; CHECK-NEXT: sshll v0.4s, v2.4h, #0 -; CHECK-NEXT: sshll2 v3.4s, v4.8h, #0 -; CHECK-NEXT: sshll2 v1.4s, v2.8h, #0 -; CHECK-NEXT: sshll v2.4s, v4.4h, #0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extadds_v16i8_i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: saddl v2.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: saddl2 v4.8h, v0.16b, v1.16b +; CHECK-SD-NEXT: sshll v0.4s, v2.4h, #0 +; CHECK-SD-NEXT: sshll2 v3.4s, v4.8h, #0 +; CHECK-SD-NEXT: sshll2 v1.4s, v2.8h, #0 +; CHECK-SD-NEXT: sshll v2.4s, v4.4h, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extadds_v16i8_i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0 +; CHECK-GI-NEXT: sshll v3.8h, v1.8b, #0 +; CHECK-GI-NEXT: sshll2 v4.8h, v0.16b, #0 +; CHECK-GI-NEXT: sshll2 v5.8h, v1.16b, #0 +; CHECK-GI-NEXT: saddl v0.4s, v2.4h, v3.4h +; CHECK-GI-NEXT: saddl2 v1.4s, v2.8h, v3.8h +; CHECK-GI-NEXT: saddl v2.4s, v4.4h, v5.4h +; CHECK-GI-NEXT: saddl2 v3.4s, v4.8h, v5.8h +; CHECK-GI-NEXT: ret entry: %s0s = sext <16 x i8> %s0 to <16 x i32> %s1s = sext <16 x i8> %s1 to <16 x i32> @@ -135,15 +242,27 @@ entry: } define <16 x i32> @extaddu_v16i8_i32(<16 x i8> %s0, <16 x i8> %s1) { -; CHECK-LABEL: extaddu_v16i8_i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uaddl v2.8h, v0.8b, v1.8b -; CHECK-NEXT: uaddl2 v4.8h, v0.16b, v1.16b -; CHECK-NEXT: ushll v0.4s, v2.4h, #0 -; CHECK-NEXT: ushll2 v3.4s, v4.8h, #0 -; CHECK-NEXT: ushll2 v1.4s, v2.8h, #0 -; CHECK-NEXT: ushll v2.4s, v4.4h, #0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extaddu_v16i8_i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: uaddl v2.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: uaddl2 v4.8h, v0.16b, v1.16b +; CHECK-SD-NEXT: ushll v0.4s, v2.4h, #0 +; CHECK-SD-NEXT: ushll2 v3.4s, v4.8h, #0 
+; CHECK-SD-NEXT: ushll2 v1.4s, v2.8h, #0 +; CHECK-SD-NEXT: ushll v2.4s, v4.4h, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extaddu_v16i8_i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ushll v2.8h, v0.8b, #0 +; CHECK-GI-NEXT: ushll v3.8h, v1.8b, #0 +; CHECK-GI-NEXT: ushll2 v4.8h, v0.16b, #0 +; CHECK-GI-NEXT: ushll2 v5.8h, v1.16b, #0 +; CHECK-GI-NEXT: uaddl v0.4s, v2.4h, v3.4h +; CHECK-GI-NEXT: uaddl2 v1.4s, v2.8h, v3.8h +; CHECK-GI-NEXT: uaddl v2.4s, v4.4h, v5.4h +; CHECK-GI-NEXT: uaddl2 v3.4s, v4.8h, v5.8h +; CHECK-GI-NEXT: ret entry: %s0s = zext <16 x i8> %s0 to <16 x i32> %s1s = zext <16 x i8> %s1 to <16 x i32> @@ -151,17 +270,89 @@ entry: ret <16 x i32> %m } +define <16 x i32> @extsubs_v16i8_i32(<16 x i8> %s0, <16 x i8> %s1) { +; CHECK-SD-LABEL: extsubs_v16i8_i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ssubl v2.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: ssubl2 v4.8h, v0.16b, v1.16b +; CHECK-SD-NEXT: sshll v0.4s, v2.4h, #0 +; CHECK-SD-NEXT: sshll2 v3.4s, v4.8h, #0 +; CHECK-SD-NEXT: sshll2 v1.4s, v2.8h, #0 +; CHECK-SD-NEXT: sshll v2.4s, v4.4h, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extsubs_v16i8_i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0 +; CHECK-GI-NEXT: sshll v3.8h, v1.8b, #0 +; CHECK-GI-NEXT: sshll2 v4.8h, v0.16b, #0 +; CHECK-GI-NEXT: sshll2 v5.8h, v1.16b, #0 +; CHECK-GI-NEXT: ssubl v0.4s, v2.4h, v3.4h +; CHECK-GI-NEXT: ssubl2 v1.4s, v2.8h, v3.8h +; CHECK-GI-NEXT: ssubl v2.4s, v4.4h, v5.4h +; CHECK-GI-NEXT: ssubl2 v3.4s, v4.8h, v5.8h +; CHECK-GI-NEXT: ret +entry: + %s0s = sext <16 x i8> %s0 to <16 x i32> + %s1s = sext <16 x i8> %s1 to <16 x i32> + %m = sub <16 x i32> %s0s, %s1s + ret <16 x i32> %m +} + +define <16 x i32> @extsubu_v16i8_i32(<16 x i8> %s0, <16 x i8> %s1) { +; CHECK-SD-LABEL: extsubu_v16i8_i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: usubl v2.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: usubl2 v4.8h, v0.16b, v1.16b +; CHECK-SD-NEXT: sshll v0.4s, v2.4h, #0 +; CHECK-SD-NEXT: sshll2 v3.4s, 
v4.8h, #0 +; CHECK-SD-NEXT: sshll2 v1.4s, v2.8h, #0 +; CHECK-SD-NEXT: sshll v2.4s, v4.4h, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extsubu_v16i8_i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ushll v2.8h, v0.8b, #0 +; CHECK-GI-NEXT: ushll v3.8h, v1.8b, #0 +; CHECK-GI-NEXT: ushll2 v4.8h, v0.16b, #0 +; CHECK-GI-NEXT: ushll2 v5.8h, v1.16b, #0 +; CHECK-GI-NEXT: usubl v0.4s, v2.4h, v3.4h +; CHECK-GI-NEXT: usubl2 v1.4s, v2.8h, v3.8h +; CHECK-GI-NEXT: usubl v2.4s, v4.4h, v5.4h +; CHECK-GI-NEXT: usubl2 v3.4s, v4.8h, v5.8h +; CHECK-GI-NEXT: ret +entry: + %s0s = zext <16 x i8> %s0 to <16 x i32> + %s1s = zext <16 x i8> %s1 to <16 x i32> + %m = sub <16 x i32> %s0s, %s1s + ret <16 x i32> %m +} + define <8 x i64> @extadds_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) { -; CHECK-LABEL: extadds_v8i8_i64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: saddl v0.8h, v0.8b, v1.8b -; CHECK-NEXT: sshll v1.4s, v0.4h, #0 -; CHECK-NEXT: sshll2 v2.4s, v0.8h, #0 -; CHECK-NEXT: sshll v0.2d, v1.2s, #0 -; CHECK-NEXT: sshll2 v3.2d, v2.4s, #0 -; CHECK-NEXT: sshll2 v1.2d, v1.4s, #0 -; CHECK-NEXT: sshll v2.2d, v2.2s, #0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extadds_v8i8_i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: saddl v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: sshll v1.4s, v0.4h, #0 +; CHECK-SD-NEXT: sshll2 v2.4s, v0.8h, #0 +; CHECK-SD-NEXT: sshll v0.2d, v1.2s, #0 +; CHECK-SD-NEXT: sshll2 v3.2d, v2.4s, #0 +; CHECK-SD-NEXT: sshll2 v1.2d, v1.4s, #0 +; CHECK-SD-NEXT: sshll v2.2d, v2.2s, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extadds_v8i8_i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: sshll v2.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll v3.4s, v1.4h, #0 +; CHECK-GI-NEXT: sshll2 v4.4s, v0.8h, #0 +; CHECK-GI-NEXT: sshll2 v5.4s, v1.8h, #0 +; CHECK-GI-NEXT: saddl v0.2d, v2.2s, v3.2s +; CHECK-GI-NEXT: saddl2 v1.2d, v2.4s, v3.4s +; CHECK-GI-NEXT: saddl v2.2d, v4.2s, v5.2s +; CHECK-GI-NEXT: saddl2 v3.2d, 
v4.4s, v5.4s +; CHECK-GI-NEXT: ret entry: %s0s = sext <8 x i8> %s0 to <8 x i64> %s1s = sext <8 x i8> %s1 to <8 x i64> @@ -170,16 +361,30 @@ entry: } define <8 x i64> @extaddu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) { -; CHECK-LABEL: extaddu_v8i8_i64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b -; CHECK-NEXT: ushll v1.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v2.4s, v0.8h, #0 -; CHECK-NEXT: ushll v0.2d, v1.2s, #0 -; CHECK-NEXT: ushll2 v3.2d, v2.4s, #0 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-NEXT: ushll v2.2d, v2.2s, #0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extaddu_v8i8_i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: ushll v1.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll2 v2.4s, v0.8h, #0 +; CHECK-SD-NEXT: ushll v0.2d, v1.2s, #0 +; CHECK-SD-NEXT: ushll2 v3.2d, v2.4s, #0 +; CHECK-SD-NEXT: ushll2 v1.2d, v1.4s, #0 +; CHECK-SD-NEXT: ushll v2.2d, v2.2s, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extaddu_v8i8_i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: ushll v2.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll v3.4s, v1.4h, #0 +; CHECK-GI-NEXT: ushll2 v4.4s, v0.8h, #0 +; CHECK-GI-NEXT: ushll2 v5.4s, v1.8h, #0 +; CHECK-GI-NEXT: uaddl v0.2d, v2.2s, v3.2s +; CHECK-GI-NEXT: uaddl2 v1.2d, v2.4s, v3.4s +; CHECK-GI-NEXT: uaddl v2.2d, v4.2s, v5.2s +; CHECK-GI-NEXT: uaddl2 v3.2d, v4.4s, v5.4s +; CHECK-GI-NEXT: ret entry: %s0s = zext <8 x i8> %s0 to <8 x i64> %s1s = zext <8 x i8> %s1 to <8 x i64> @@ -187,6 +392,430 @@ entry: ret <8 x i64> %m } +define <8 x i64> @extsubs_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) { +; CHECK-SD-LABEL: extsubs_v8i8_i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ssubl v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: sshll v1.4s, v0.4h, #0 +; CHECK-SD-NEXT: sshll2 v2.4s, v0.8h, #0 +; CHECK-SD-NEXT: sshll v0.2d, v1.2s, #0 +; CHECK-SD-NEXT: sshll2 v3.2d, v2.4s, #0 +; CHECK-SD-NEXT: sshll2 v1.2d, v1.4s, #0 +; 
CHECK-SD-NEXT: sshll v2.2d, v2.2s, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extsubs_v8i8_i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: sshll v2.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll v3.4s, v1.4h, #0 +; CHECK-GI-NEXT: sshll2 v4.4s, v0.8h, #0 +; CHECK-GI-NEXT: sshll2 v5.4s, v1.8h, #0 +; CHECK-GI-NEXT: ssubl v0.2d, v2.2s, v3.2s +; CHECK-GI-NEXT: ssubl2 v1.2d, v2.4s, v3.4s +; CHECK-GI-NEXT: ssubl v2.2d, v4.2s, v5.2s +; CHECK-GI-NEXT: ssubl2 v3.2d, v4.4s, v5.4s +; CHECK-GI-NEXT: ret +entry: + %s0s = sext <8 x i8> %s0 to <8 x i64> + %s1s = sext <8 x i8> %s1 to <8 x i64> + %m = sub <8 x i64> %s0s, %s1s + ret <8 x i64> %m +} + +define <8 x i64> @extsubu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) { +; CHECK-SD-LABEL: extsubu_v8i8_i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: usubl v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: sshll v1.4s, v0.4h, #0 +; CHECK-SD-NEXT: sshll2 v2.4s, v0.8h, #0 +; CHECK-SD-NEXT: sshll v0.2d, v1.2s, #0 +; CHECK-SD-NEXT: sshll2 v3.2d, v2.4s, #0 +; CHECK-SD-NEXT: sshll2 v1.2d, v1.4s, #0 +; CHECK-SD-NEXT: sshll v2.2d, v2.2s, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extsubu_v8i8_i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: ushll v2.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll v3.4s, v1.4h, #0 +; CHECK-GI-NEXT: ushll2 v4.4s, v0.8h, #0 +; CHECK-GI-NEXT: ushll2 v5.4s, v1.8h, #0 +; CHECK-GI-NEXT: usubl v0.2d, v2.2s, v3.2s +; CHECK-GI-NEXT: usubl2 v1.2d, v2.4s, v3.4s +; CHECK-GI-NEXT: usubl v2.2d, v4.2s, v5.2s +; CHECK-GI-NEXT: usubl2 v3.2d, v4.4s, v5.4s +; CHECK-GI-NEXT: ret +entry: + %s0s = zext <8 x i8> %s0 to <8 x i64> + %s1s = zext <8 x i8> %s1 to <8 x i64> + %m = sub <8 x i64> %s0s, %s1s + ret <8 x i64> %m +} + +define <16 x i64> @extaddu_v16i8_i64(<16 x i8> %a, <16 x i8> %b) { +; CHECK-SD-LABEL: extaddu_v16i8_i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: uaddl 
v2.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: uaddl2 v0.8h, v0.16b, v1.16b +; CHECK-SD-NEXT: ushll v3.4s, v2.4h, #0 +; CHECK-SD-NEXT: ushll2 v2.4s, v2.8h, #0 +; CHECK-SD-NEXT: ushll v5.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll2 v6.4s, v0.8h, #0 +; CHECK-SD-NEXT: ushll2 v1.2d, v3.4s, #0 +; CHECK-SD-NEXT: ushll v0.2d, v3.2s, #0 +; CHECK-SD-NEXT: ushll2 v3.2d, v2.4s, #0 +; CHECK-SD-NEXT: ushll v2.2d, v2.2s, #0 +; CHECK-SD-NEXT: ushll v4.2d, v5.2s, #0 +; CHECK-SD-NEXT: ushll2 v7.2d, v6.4s, #0 +; CHECK-SD-NEXT: ushll2 v5.2d, v5.4s, #0 +; CHECK-SD-NEXT: ushll v6.2d, v6.2s, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extaddu_v16i8_i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ushll v2.8h, v0.8b, #0 +; CHECK-GI-NEXT: ushll v3.8h, v1.8b, #0 +; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0 +; CHECK-GI-NEXT: ushll2 v1.8h, v1.16b, #0 +; CHECK-GI-NEXT: ushll v4.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll2 v5.4s, v2.8h, #0 +; CHECK-GI-NEXT: ushll v2.4s, v3.4h, #0 +; CHECK-GI-NEXT: ushll v6.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll2 v3.4s, v3.8h, #0 +; CHECK-GI-NEXT: ushll v7.4s, v1.4h, #0 +; CHECK-GI-NEXT: ushll2 v16.4s, v0.8h, #0 +; CHECK-GI-NEXT: ushll2 v17.4s, v1.8h, #0 +; CHECK-GI-NEXT: uaddl v0.2d, v4.2s, v2.2s +; CHECK-GI-NEXT: uaddl2 v1.2d, v4.4s, v2.4s +; CHECK-GI-NEXT: uaddl v2.2d, v5.2s, v3.2s +; CHECK-GI-NEXT: uaddl2 v3.2d, v5.4s, v3.4s +; CHECK-GI-NEXT: uaddl v4.2d, v6.2s, v7.2s +; CHECK-GI-NEXT: uaddl2 v5.2d, v6.4s, v7.4s +; CHECK-GI-NEXT: uaddl v6.2d, v16.2s, v17.2s +; CHECK-GI-NEXT: uaddl2 v7.2d, v16.4s, v17.4s +; CHECK-GI-NEXT: ret + %c = zext <16 x i8> %a to <16 x i64> + %d = zext <16 x i8> %b to <16 x i64> + %e = add <16 x i64> %c, %d + ret <16 x i64> %e +} + +define <16 x i64> @extadds_v16i8_i64(<16 x i8> %a, <16 x i8> %b) { +; CHECK-SD-LABEL: extadds_v16i8_i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: saddl v2.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: saddl2 v0.8h, v0.16b, v1.16b +; CHECK-SD-NEXT: sshll v3.4s, v2.4h, #0 +; CHECK-SD-NEXT: sshll2 v2.4s, v2.8h, #0 +; CHECK-SD-NEXT: sshll 
v5.4s, v0.4h, #0 +; CHECK-SD-NEXT: sshll2 v6.4s, v0.8h, #0 +; CHECK-SD-NEXT: sshll2 v1.2d, v3.4s, #0 +; CHECK-SD-NEXT: sshll v0.2d, v3.2s, #0 +; CHECK-SD-NEXT: sshll2 v3.2d, v2.4s, #0 +; CHECK-SD-NEXT: sshll v2.2d, v2.2s, #0 +; CHECK-SD-NEXT: sshll v4.2d, v5.2s, #0 +; CHECK-SD-NEXT: sshll2 v7.2d, v6.4s, #0 +; CHECK-SD-NEXT: sshll2 v5.2d, v5.4s, #0 +; CHECK-SD-NEXT: sshll v6.2d, v6.2s, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extadds_v16i8_i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0 +; CHECK-GI-NEXT: sshll v3.8h, v1.8b, #0 +; CHECK-GI-NEXT: sshll2 v0.8h, v0.16b, #0 +; CHECK-GI-NEXT: sshll2 v1.8h, v1.16b, #0 +; CHECK-GI-NEXT: sshll v4.4s, v2.4h, #0 +; CHECK-GI-NEXT: sshll2 v5.4s, v2.8h, #0 +; CHECK-GI-NEXT: sshll v2.4s, v3.4h, #0 +; CHECK-GI-NEXT: sshll v6.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, #0 +; CHECK-GI-NEXT: sshll v7.4s, v1.4h, #0 +; CHECK-GI-NEXT: sshll2 v16.4s, v0.8h, #0 +; CHECK-GI-NEXT: sshll2 v17.4s, v1.8h, #0 +; CHECK-GI-NEXT: saddl v0.2d, v4.2s, v2.2s +; CHECK-GI-NEXT: saddl2 v1.2d, v4.4s, v2.4s +; CHECK-GI-NEXT: saddl v2.2d, v5.2s, v3.2s +; CHECK-GI-NEXT: saddl2 v3.2d, v5.4s, v3.4s +; CHECK-GI-NEXT: saddl v4.2d, v6.2s, v7.2s +; CHECK-GI-NEXT: saddl2 v5.2d, v6.4s, v7.4s +; CHECK-GI-NEXT: saddl v6.2d, v16.2s, v17.2s +; CHECK-GI-NEXT: saddl2 v7.2d, v16.4s, v17.4s +; CHECK-GI-NEXT: ret + %c = sext <16 x i8> %a to <16 x i64> + %d = sext <16 x i8> %b to <16 x i64> + %e = add <16 x i64> %c, %d + ret <16 x i64> %e +} + +define <16 x i64> @extsubu_v16i8_i64(<16 x i8> %a, <16 x i8> %b) { +; CHECK-SD-LABEL: extsubu_v16i8_i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: usubl v2.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: usubl2 v0.8h, v0.16b, v1.16b +; CHECK-SD-NEXT: sshll v3.4s, v2.4h, #0 +; CHECK-SD-NEXT: sshll2 v2.4s, v2.8h, #0 +; CHECK-SD-NEXT: sshll v5.4s, v0.4h, #0 +; CHECK-SD-NEXT: sshll2 v6.4s, v0.8h, #0 +; CHECK-SD-NEXT: sshll2 v1.2d, v3.4s, #0 +; CHECK-SD-NEXT: sshll v0.2d, v3.2s, #0 +; CHECK-SD-NEXT: sshll2 v3.2d, 
v2.4s, #0 +; CHECK-SD-NEXT: sshll v2.2d, v2.2s, #0 +; CHECK-SD-NEXT: sshll v4.2d, v5.2s, #0 +; CHECK-SD-NEXT: sshll2 v7.2d, v6.4s, #0 +; CHECK-SD-NEXT: sshll2 v5.2d, v5.4s, #0 +; CHECK-SD-NEXT: sshll v6.2d, v6.2s, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extsubu_v16i8_i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ushll v2.8h, v0.8b, #0 +; CHECK-GI-NEXT: ushll v3.8h, v1.8b, #0 +; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0 +; CHECK-GI-NEXT: ushll2 v1.8h, v1.16b, #0 +; CHECK-GI-NEXT: ushll v4.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll2 v5.4s, v2.8h, #0 +; CHECK-GI-NEXT: ushll v2.4s, v3.4h, #0 +; CHECK-GI-NEXT: ushll v6.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll2 v3.4s, v3.8h, #0 +; CHECK-GI-NEXT: ushll v7.4s, v1.4h, #0 +; CHECK-GI-NEXT: ushll2 v16.4s, v0.8h, #0 +; CHECK-GI-NEXT: ushll2 v17.4s, v1.8h, #0 +; CHECK-GI-NEXT: usubl v0.2d, v4.2s, v2.2s +; CHECK-GI-NEXT: usubl2 v1.2d, v4.4s, v2.4s +; CHECK-GI-NEXT: usubl v2.2d, v5.2s, v3.2s +; CHECK-GI-NEXT: usubl2 v3.2d, v5.4s, v3.4s +; CHECK-GI-NEXT: usubl v4.2d, v6.2s, v7.2s +; CHECK-GI-NEXT: usubl2 v5.2d, v6.4s, v7.4s +; CHECK-GI-NEXT: usubl v6.2d, v16.2s, v17.2s +; CHECK-GI-NEXT: usubl2 v7.2d, v16.4s, v17.4s +; CHECK-GI-NEXT: ret + %c = zext <16 x i8> %a to <16 x i64> + %d = zext <16 x i8> %b to <16 x i64> + %e = sub <16 x i64> %c, %d + ret <16 x i64> %e +} + +define <16 x i64> @extsubs_v16i8_i64(<16 x i8> %a, <16 x i8> %b) { +; CHECK-SD-LABEL: extsubs_v16i8_i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ssubl v2.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: ssubl2 v0.8h, v0.16b, v1.16b +; CHECK-SD-NEXT: sshll v3.4s, v2.4h, #0 +; CHECK-SD-NEXT: sshll2 v2.4s, v2.8h, #0 +; CHECK-SD-NEXT: sshll v5.4s, v0.4h, #0 +; CHECK-SD-NEXT: sshll2 v6.4s, v0.8h, #0 +; CHECK-SD-NEXT: sshll2 v1.2d, v3.4s, #0 +; CHECK-SD-NEXT: sshll v0.2d, v3.2s, #0 +; CHECK-SD-NEXT: sshll2 v3.2d, v2.4s, #0 +; CHECK-SD-NEXT: sshll v2.2d, v2.2s, #0 +; CHECK-SD-NEXT: sshll v4.2d, v5.2s, #0 +; CHECK-SD-NEXT: sshll2 v7.2d, v6.4s, #0 +; CHECK-SD-NEXT: sshll2 v5.2d, v5.4s, #0 +; 
CHECK-SD-NEXT: sshll v6.2d, v6.2s, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extsubs_v16i8_i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0 +; CHECK-GI-NEXT: sshll v3.8h, v1.8b, #0 +; CHECK-GI-NEXT: sshll2 v0.8h, v0.16b, #0 +; CHECK-GI-NEXT: sshll2 v1.8h, v1.16b, #0 +; CHECK-GI-NEXT: sshll v4.4s, v2.4h, #0 +; CHECK-GI-NEXT: sshll2 v5.4s, v2.8h, #0 +; CHECK-GI-NEXT: sshll v2.4s, v3.4h, #0 +; CHECK-GI-NEXT: sshll v6.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, #0 +; CHECK-GI-NEXT: sshll v7.4s, v1.4h, #0 +; CHECK-GI-NEXT: sshll2 v16.4s, v0.8h, #0 +; CHECK-GI-NEXT: sshll2 v17.4s, v1.8h, #0 +; CHECK-GI-NEXT: ssubl v0.2d, v4.2s, v2.2s +; CHECK-GI-NEXT: ssubl2 v1.2d, v4.4s, v2.4s +; CHECK-GI-NEXT: ssubl v2.2d, v5.2s, v3.2s +; CHECK-GI-NEXT: ssubl2 v3.2d, v5.4s, v3.4s +; CHECK-GI-NEXT: ssubl v4.2d, v6.2s, v7.2s +; CHECK-GI-NEXT: ssubl2 v5.2d, v6.4s, v7.4s +; CHECK-GI-NEXT: ssubl v6.2d, v16.2s, v17.2s +; CHECK-GI-NEXT: ssubl2 v7.2d, v16.4s, v17.4s +; CHECK-GI-NEXT: ret + %c = sext <16 x i8> %a to <16 x i64> + %d = sext <16 x i8> %b to <16 x i64> + %e = sub <16 x i64> %c, %d + ret <16 x i64> %e +} + +define <16 x i64> @extaddu_v16i16_i64(<16 x i16> %a, <16 x i16> %b) { +; CHECK-SD-LABEL: extaddu_v16i16_i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: uaddl v5.4s, v1.4h, v3.4h +; CHECK-SD-NEXT: uaddl v4.4s, v0.4h, v2.4h +; CHECK-SD-NEXT: uaddl2 v2.4s, v0.8h, v2.8h +; CHECK-SD-NEXT: uaddl2 v6.4s, v1.8h, v3.8h +; CHECK-SD-NEXT: ushll2 v1.2d, v4.4s, #0 +; CHECK-SD-NEXT: ushll v0.2d, v4.2s, #0 +; CHECK-SD-NEXT: ushll2 v3.2d, v2.4s, #0 +; CHECK-SD-NEXT: ushll v2.2d, v2.2s, #0 +; CHECK-SD-NEXT: ushll v4.2d, v5.2s, #0 +; CHECK-SD-NEXT: ushll2 v7.2d, v6.4s, #0 +; CHECK-SD-NEXT: ushll2 v5.2d, v5.4s, #0 +; CHECK-SD-NEXT: ushll v6.2d, v6.2s, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extaddu_v16i16_i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ushll v4.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll2 v5.4s, v0.8h, #0 +; CHECK-GI-NEXT: ushll v6.4s, v2.4h, #0 +; 
CHECK-GI-NEXT: ushll v7.4s, v1.4h, #0 +; CHECK-GI-NEXT: ushll2 v16.4s, v2.8h, #0 +; CHECK-GI-NEXT: ushll v17.4s, v3.4h, #0 +; CHECK-GI-NEXT: ushll2 v18.4s, v1.8h, #0 +; CHECK-GI-NEXT: ushll2 v19.4s, v3.8h, #0 +; CHECK-GI-NEXT: uaddl v0.2d, v4.2s, v6.2s +; CHECK-GI-NEXT: uaddl2 v1.2d, v4.4s, v6.4s +; CHECK-GI-NEXT: uaddl v2.2d, v5.2s, v16.2s +; CHECK-GI-NEXT: uaddl2 v3.2d, v5.4s, v16.4s +; CHECK-GI-NEXT: uaddl v4.2d, v7.2s, v17.2s +; CHECK-GI-NEXT: uaddl2 v5.2d, v7.4s, v17.4s +; CHECK-GI-NEXT: uaddl v6.2d, v18.2s, v19.2s +; CHECK-GI-NEXT: uaddl2 v7.2d, v18.4s, v19.4s +; CHECK-GI-NEXT: ret + %c = zext <16 x i16> %a to <16 x i64> + %d = zext <16 x i16> %b to <16 x i64> + %e = add <16 x i64> %c, %d + ret <16 x i64> %e +} + +define <16 x i64> @extadds_v16i16_i64(<16 x i16> %a, <16 x i16> %b) { +; CHECK-SD-LABEL: extadds_v16i16_i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: saddl v5.4s, v1.4h, v3.4h +; CHECK-SD-NEXT: saddl v4.4s, v0.4h, v2.4h +; CHECK-SD-NEXT: saddl2 v2.4s, v0.8h, v2.8h +; CHECK-SD-NEXT: saddl2 v6.4s, v1.8h, v3.8h +; CHECK-SD-NEXT: sshll2 v1.2d, v4.4s, #0 +; CHECK-SD-NEXT: sshll v0.2d, v4.2s, #0 +; CHECK-SD-NEXT: sshll2 v3.2d, v2.4s, #0 +; CHECK-SD-NEXT: sshll v2.2d, v2.2s, #0 +; CHECK-SD-NEXT: sshll v4.2d, v5.2s, #0 +; CHECK-SD-NEXT: sshll2 v7.2d, v6.4s, #0 +; CHECK-SD-NEXT: sshll2 v5.2d, v5.4s, #0 +; CHECK-SD-NEXT: sshll v6.2d, v6.2s, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extadds_v16i16_i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sshll v4.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll2 v5.4s, v0.8h, #0 +; CHECK-GI-NEXT: sshll v6.4s, v2.4h, #0 +; CHECK-GI-NEXT: sshll v7.4s, v1.4h, #0 +; CHECK-GI-NEXT: sshll2 v16.4s, v2.8h, #0 +; CHECK-GI-NEXT: sshll v17.4s, v3.4h, #0 +; CHECK-GI-NEXT: sshll2 v18.4s, v1.8h, #0 +; CHECK-GI-NEXT: sshll2 v19.4s, v3.8h, #0 +; CHECK-GI-NEXT: saddl v0.2d, v4.2s, v6.2s +; CHECK-GI-NEXT: saddl2 v1.2d, v4.4s, v6.4s +; CHECK-GI-NEXT: saddl v2.2d, v5.2s, v16.2s +; CHECK-GI-NEXT: saddl2 v3.2d, v5.4s, v16.4s +; CHECK-GI-NEXT: saddl 
v4.2d, v7.2s, v17.2s +; CHECK-GI-NEXT: saddl2 v5.2d, v7.4s, v17.4s +; CHECK-GI-NEXT: saddl v6.2d, v18.2s, v19.2s +; CHECK-GI-NEXT: saddl2 v7.2d, v18.4s, v19.4s +; CHECK-GI-NEXT: ret + %c = sext <16 x i16> %a to <16 x i64> + %d = sext <16 x i16> %b to <16 x i64> + %e = add <16 x i64> %c, %d + ret <16 x i64> %e +} + +define <16 x i64> @extsubu_v16i16_i64(<16 x i16> %a, <16 x i16> %b) { +; CHECK-SD-LABEL: extsubu_v16i16_i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: usubl v5.4s, v1.4h, v3.4h +; CHECK-SD-NEXT: usubl v4.4s, v0.4h, v2.4h +; CHECK-SD-NEXT: usubl2 v2.4s, v0.8h, v2.8h +; CHECK-SD-NEXT: usubl2 v6.4s, v1.8h, v3.8h +; CHECK-SD-NEXT: sshll2 v1.2d, v4.4s, #0 +; CHECK-SD-NEXT: sshll v0.2d, v4.2s, #0 +; CHECK-SD-NEXT: sshll2 v3.2d, v2.4s, #0 +; CHECK-SD-NEXT: sshll v2.2d, v2.2s, #0 +; CHECK-SD-NEXT: sshll v4.2d, v5.2s, #0 +; CHECK-SD-NEXT: sshll2 v7.2d, v6.4s, #0 +; CHECK-SD-NEXT: sshll2 v5.2d, v5.4s, #0 +; CHECK-SD-NEXT: sshll v6.2d, v6.2s, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extsubu_v16i16_i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ushll v4.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll2 v5.4s, v0.8h, #0 +; CHECK-GI-NEXT: ushll v6.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll v7.4s, v1.4h, #0 +; CHECK-GI-NEXT: ushll2 v16.4s, v2.8h, #0 +; CHECK-GI-NEXT: ushll v17.4s, v3.4h, #0 +; CHECK-GI-NEXT: ushll2 v18.4s, v1.8h, #0 +; CHECK-GI-NEXT: ushll2 v19.4s, v3.8h, #0 +; CHECK-GI-NEXT: usubl v0.2d, v4.2s, v6.2s +; CHECK-GI-NEXT: usubl2 v1.2d, v4.4s, v6.4s +; CHECK-GI-NEXT: usubl v2.2d, v5.2s, v16.2s +; CHECK-GI-NEXT: usubl2 v3.2d, v5.4s, v16.4s +; CHECK-GI-NEXT: usubl v4.2d, v7.2s, v17.2s +; CHECK-GI-NEXT: usubl2 v5.2d, v7.4s, v17.4s +; CHECK-GI-NEXT: usubl v6.2d, v18.2s, v19.2s +; CHECK-GI-NEXT: usubl2 v7.2d, v18.4s, v19.4s +; CHECK-GI-NEXT: ret + %c = zext <16 x i16> %a to <16 x i64> + %d = zext <16 x i16> %b to <16 x i64> + %e = sub <16 x i64> %c, %d + ret <16 x i64> %e +} + +define <16 x i64> @extsubs_v16i16_i64(<16 x i16> %a, <16 x i16> %b) { +; CHECK-SD-LABEL: 
extsubs_v16i16_i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ssubl v5.4s, v1.4h, v3.4h +; CHECK-SD-NEXT: ssubl v4.4s, v0.4h, v2.4h +; CHECK-SD-NEXT: ssubl2 v2.4s, v0.8h, v2.8h +; CHECK-SD-NEXT: ssubl2 v6.4s, v1.8h, v3.8h +; CHECK-SD-NEXT: sshll2 v1.2d, v4.4s, #0 +; CHECK-SD-NEXT: sshll v0.2d, v4.2s, #0 +; CHECK-SD-NEXT: sshll2 v3.2d, v2.4s, #0 +; CHECK-SD-NEXT: sshll v2.2d, v2.2s, #0 +; CHECK-SD-NEXT: sshll v4.2d, v5.2s, #0 +; CHECK-SD-NEXT: sshll2 v7.2d, v6.4s, #0 +; CHECK-SD-NEXT: sshll2 v5.2d, v5.4s, #0 +; CHECK-SD-NEXT: sshll v6.2d, v6.2s, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extsubs_v16i16_i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sshll v4.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll2 v5.4s, v0.8h, #0 +; CHECK-GI-NEXT: sshll v6.4s, v2.4h, #0 +; CHECK-GI-NEXT: sshll v7.4s, v1.4h, #0 +; CHECK-GI-NEXT: sshll2 v16.4s, v2.8h, #0 +; CHECK-GI-NEXT: sshll v17.4s, v3.4h, #0 +; CHECK-GI-NEXT: sshll2 v18.4s, v1.8h, #0 +; CHECK-GI-NEXT: sshll2 v19.4s, v3.8h, #0 +; CHECK-GI-NEXT: ssubl v0.2d, v4.2s, v6.2s +; CHECK-GI-NEXT: ssubl2 v1.2d, v4.4s, v6.4s +; CHECK-GI-NEXT: ssubl v2.2d, v5.2s, v16.2s +; CHECK-GI-NEXT: ssubl2 v3.2d, v5.4s, v16.4s +; CHECK-GI-NEXT: ssubl v4.2d, v7.2s, v17.2s +; CHECK-GI-NEXT: ssubl2 v5.2d, v7.4s, v17.4s +; CHECK-GI-NEXT: ssubl v6.2d, v18.2s, v19.2s +; CHECK-GI-NEXT: ssubl2 v7.2d, v18.4s, v19.4s +; CHECK-GI-NEXT: ret + %c = sext <16 x i16> %a to <16 x i64> + %d = sext <16 x i16> %b to <16 x i64> + %e = sub <16 x i64> %c, %d + ret <16 x i64> %e +} + define <4 x i32> @extadds_v4i16_i32(<4 x i16> %s0, <4 x i16> %s1) { ; CHECK-LABEL: extadds_v4i16_i32: ; CHECK: // %bb.0: // %entry @@ -212,12 +841,19 @@ entry: } define <8 x i32> @extadds_v8i16_i32(<8 x i16> %s0, <8 x i16> %s1) { -; CHECK-LABEL: extadds_v8i16_i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: saddl2 v2.4s, v0.8h, v1.8h -; CHECK-NEXT: saddl v0.4s, v0.4h, v1.4h -; CHECK-NEXT: mov v1.16b, v2.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extadds_v8i16_i32: +; CHECK-SD: // %bb.0: // %entry 
+; CHECK-SD-NEXT: saddl2 v2.4s, v0.8h, v1.8h +; CHECK-SD-NEXT: saddl v0.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: mov v1.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extadds_v8i16_i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: saddl v2.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: saddl2 v1.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: mov v0.16b, v2.16b +; CHECK-GI-NEXT: ret entry: %s0s = sext <8 x i16> %s0 to <8 x i32> %s1s = sext <8 x i16> %s1 to <8 x i32> @@ -226,12 +862,19 @@ entry: } define <8 x i32> @extaddu_v8i16_i32(<8 x i16> %s0, <8 x i16> %s1) { -; CHECK-LABEL: extaddu_v8i16_i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uaddl2 v2.4s, v0.8h, v1.8h -; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h -; CHECK-NEXT: mov v1.16b, v2.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extaddu_v8i16_i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: uaddl2 v2.4s, v0.8h, v1.8h +; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: mov v1.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extaddu_v8i16_i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: uaddl v2.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: uaddl2 v1.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: mov v0.16b, v2.16b +; CHECK-GI-NEXT: ret entry: %s0s = zext <8 x i16> %s0 to <8 x i32> %s1s = zext <8 x i16> %s1 to <8 x i32> @@ -240,16 +883,26 @@ entry: } define <16 x i32> @extadds_v16i16_i32(<16 x i16> %s0, <16 x i16> %s1) { -; CHECK-LABEL: extadds_v16i16_i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: saddl2 v4.4s, v1.8h, v3.8h -; CHECK-NEXT: saddl v5.4s, v0.4h, v2.4h -; CHECK-NEXT: saddl2 v6.4s, v0.8h, v2.8h -; CHECK-NEXT: saddl v2.4s, v1.4h, v3.4h -; CHECK-NEXT: mov v0.16b, v5.16b -; CHECK-NEXT: mov v1.16b, v6.16b -; CHECK-NEXT: mov v3.16b, v4.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extadds_v16i16_i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: saddl2 v4.4s, v1.8h, v3.8h +; CHECK-SD-NEXT: saddl v5.4s, v0.4h, v2.4h +; CHECK-SD-NEXT: saddl2 v6.4s, v0.8h, v2.8h +; CHECK-SD-NEXT: saddl v2.4s, v1.4h, v3.4h +; 
CHECK-SD-NEXT: mov v0.16b, v5.16b +; CHECK-SD-NEXT: mov v1.16b, v6.16b +; CHECK-SD-NEXT: mov v3.16b, v4.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extadds_v16i16_i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: saddl v4.4s, v0.4h, v2.4h +; CHECK-GI-NEXT: saddl2 v5.4s, v0.8h, v2.8h +; CHECK-GI-NEXT: saddl v2.4s, v1.4h, v3.4h +; CHECK-GI-NEXT: saddl2 v3.4s, v1.8h, v3.8h +; CHECK-GI-NEXT: mov v0.16b, v4.16b +; CHECK-GI-NEXT: mov v1.16b, v5.16b +; CHECK-GI-NEXT: ret entry: %s0s = sext <16 x i16> %s0 to <16 x i32> %s1s = sext <16 x i16> %s1 to <16 x i32> @@ -258,16 +911,26 @@ entry: } define <16 x i32> @extaddu_v16i16_i32(<16 x i16> %s0, <16 x i16> %s1) { -; CHECK-LABEL: extaddu_v16i16_i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uaddl2 v4.4s, v1.8h, v3.8h -; CHECK-NEXT: uaddl v5.4s, v0.4h, v2.4h -; CHECK-NEXT: uaddl2 v6.4s, v0.8h, v2.8h -; CHECK-NEXT: uaddl v2.4s, v1.4h, v3.4h -; CHECK-NEXT: mov v0.16b, v5.16b -; CHECK-NEXT: mov v1.16b, v6.16b -; CHECK-NEXT: mov v3.16b, v4.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extaddu_v16i16_i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: uaddl2 v4.4s, v1.8h, v3.8h +; CHECK-SD-NEXT: uaddl v5.4s, v0.4h, v2.4h +; CHECK-SD-NEXT: uaddl2 v6.4s, v0.8h, v2.8h +; CHECK-SD-NEXT: uaddl v2.4s, v1.4h, v3.4h +; CHECK-SD-NEXT: mov v0.16b, v5.16b +; CHECK-SD-NEXT: mov v1.16b, v6.16b +; CHECK-SD-NEXT: mov v3.16b, v4.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extaddu_v16i16_i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: uaddl v4.4s, v0.4h, v2.4h +; CHECK-GI-NEXT: uaddl2 v5.4s, v0.8h, v2.8h +; CHECK-GI-NEXT: uaddl v2.4s, v1.4h, v3.4h +; CHECK-GI-NEXT: uaddl2 v3.4s, v1.8h, v3.8h +; CHECK-GI-NEXT: mov v0.16b, v4.16b +; CHECK-GI-NEXT: mov v1.16b, v5.16b +; CHECK-GI-NEXT: ret entry: %s0s = zext <16 x i16> %s0 to <16 x i32> %s1s = zext <16 x i16> %s1 to <16 x i32> @@ -276,12 +939,20 @@ entry: } define <4 x i64> @extadds_v4i16_i64(<4 x i16> %s0, <4 x i16> %s1) { -; CHECK-LABEL: extadds_v4i16_i64: -; CHECK: // %bb.0: 
// %entry -; CHECK-NEXT: saddl v0.4s, v0.4h, v1.4h -; CHECK-NEXT: sshll2 v1.2d, v0.4s, #0 -; CHECK-NEXT: sshll v0.2d, v0.2s, #0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extadds_v4i16_i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: saddl v0.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: sshll2 v1.2d, v0.4s, #0 +; CHECK-SD-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extadds_v4i16_i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sshll v2.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: saddl v0.2d, v2.2s, v1.2s +; CHECK-GI-NEXT: saddl2 v1.2d, v2.4s, v1.4s +; CHECK-GI-NEXT: ret entry: %s0s = sext <4 x i16> %s0 to <4 x i64> %s1s = sext <4 x i16> %s1 to <4 x i64> @@ -290,12 +961,20 @@ entry: } define <4 x i64> @extaddu_v4i16_i64(<4 x i16> %s0, <4 x i16> %s1) { -; CHECK-LABEL: extaddu_v4i16_i64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h -; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0 -; CHECK-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extaddu_v4i16_i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: ushll2 v1.2d, v0.4s, #0 +; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extaddu_v4i16_i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ushll v2.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: uaddl v0.2d, v2.2s, v1.2s +; CHECK-GI-NEXT: uaddl2 v1.2d, v2.4s, v1.4s +; CHECK-GI-NEXT: ret entry: %s0s = zext <4 x i16> %s0 to <4 x i64> %s1s = zext <4 x i16> %s1 to <4 x i64> @@ -304,15 +983,27 @@ entry: } define <8 x i64> @extadds_v8i16_i64(<8 x i16> %s0, <8 x i16> %s1) { -; CHECK-LABEL: extadds_v8i16_i64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: saddl v2.4s, v0.4h, v1.4h -; CHECK-NEXT: saddl2 v4.4s, v0.8h, v1.8h -; CHECK-NEXT: sshll v0.2d, v2.2s, #0 -; CHECK-NEXT: sshll2 v3.2d, v4.4s, #0 -; CHECK-NEXT: sshll2 v1.2d, v2.4s, #0 -; CHECK-NEXT: sshll v2.2d, v4.2s, #0 -; 
CHECK-NEXT: ret +; CHECK-SD-LABEL: extadds_v8i16_i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: saddl v2.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: saddl2 v4.4s, v0.8h, v1.8h +; CHECK-SD-NEXT: sshll v0.2d, v2.2s, #0 +; CHECK-SD-NEXT: sshll2 v3.2d, v4.4s, #0 +; CHECK-SD-NEXT: sshll2 v1.2d, v2.4s, #0 +; CHECK-SD-NEXT: sshll v2.2d, v4.2s, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extadds_v8i16_i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sshll v2.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll v3.4s, v1.4h, #0 +; CHECK-GI-NEXT: sshll2 v4.4s, v0.8h, #0 +; CHECK-GI-NEXT: sshll2 v5.4s, v1.8h, #0 +; CHECK-GI-NEXT: saddl v0.2d, v2.2s, v3.2s +; CHECK-GI-NEXT: saddl2 v1.2d, v2.4s, v3.4s +; CHECK-GI-NEXT: saddl v2.2d, v4.2s, v5.2s +; CHECK-GI-NEXT: saddl2 v3.2d, v4.4s, v5.4s +; CHECK-GI-NEXT: ret entry: %s0s = sext <8 x i16> %s0 to <8 x i64> %s1s = sext <8 x i16> %s1 to <8 x i64> @@ -321,15 +1012,27 @@ entry: } define <8 x i64> @extaddu_v8i16_i64(<8 x i16> %s0, <8 x i16> %s1) { -; CHECK-LABEL: extaddu_v8i16_i64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uaddl v2.4s, v0.4h, v1.4h -; CHECK-NEXT: uaddl2 v4.4s, v0.8h, v1.8h -; CHECK-NEXT: ushll v0.2d, v2.2s, #0 -; CHECK-NEXT: ushll2 v3.2d, v4.4s, #0 -; CHECK-NEXT: ushll2 v1.2d, v2.4s, #0 -; CHECK-NEXT: ushll v2.2d, v4.2s, #0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extaddu_v8i16_i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: uaddl v2.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: uaddl2 v4.4s, v0.8h, v1.8h +; CHECK-SD-NEXT: ushll v0.2d, v2.2s, #0 +; CHECK-SD-NEXT: ushll2 v3.2d, v4.4s, #0 +; CHECK-SD-NEXT: ushll2 v1.2d, v2.4s, #0 +; CHECK-SD-NEXT: ushll v2.2d, v4.2s, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extaddu_v8i16_i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ushll v2.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll v3.4s, v1.4h, #0 +; CHECK-GI-NEXT: ushll2 v4.4s, v0.8h, #0 +; CHECK-GI-NEXT: ushll2 v5.4s, v1.8h, #0 +; CHECK-GI-NEXT: uaddl v0.2d, v2.2s, v3.2s +; CHECK-GI-NEXT: uaddl2 v1.2d, v2.4s, v3.4s +; 
CHECK-GI-NEXT: uaddl v2.2d, v4.2s, v5.2s +; CHECK-GI-NEXT: uaddl2 v3.2d, v4.4s, v5.4s +; CHECK-GI-NEXT: ret entry: %s0s = zext <8 x i16> %s0 to <8 x i64> %s1s = zext <8 x i16> %s1 to <8 x i64> @@ -337,6 +1040,64 @@ entry: ret <8 x i64> %m } +define <8 x i64> @extsubs_v8i16_i64(<8 x i16> %s0, <8 x i16> %s1) { +; CHECK-SD-LABEL: extsubs_v8i16_i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ssubl v2.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: ssubl2 v4.4s, v0.8h, v1.8h +; CHECK-SD-NEXT: sshll v0.2d, v2.2s, #0 +; CHECK-SD-NEXT: sshll2 v3.2d, v4.4s, #0 +; CHECK-SD-NEXT: sshll2 v1.2d, v2.4s, #0 +; CHECK-SD-NEXT: sshll v2.2d, v4.2s, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extsubs_v8i16_i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sshll v2.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll v3.4s, v1.4h, #0 +; CHECK-GI-NEXT: sshll2 v4.4s, v0.8h, #0 +; CHECK-GI-NEXT: sshll2 v5.4s, v1.8h, #0 +; CHECK-GI-NEXT: ssubl v0.2d, v2.2s, v3.2s +; CHECK-GI-NEXT: ssubl2 v1.2d, v2.4s, v3.4s +; CHECK-GI-NEXT: ssubl v2.2d, v4.2s, v5.2s +; CHECK-GI-NEXT: ssubl2 v3.2d, v4.4s, v5.4s +; CHECK-GI-NEXT: ret +entry: + %s0s = sext <8 x i16> %s0 to <8 x i64> + %s1s = sext <8 x i16> %s1 to <8 x i64> + %m = sub <8 x i64> %s0s, %s1s + ret <8 x i64> %m +} + +define <8 x i64> @extsubu_v8i16_i64(<8 x i16> %s0, <8 x i16> %s1) { +; CHECK-SD-LABEL: extsubu_v8i16_i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: usubl v2.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: usubl2 v4.4s, v0.8h, v1.8h +; CHECK-SD-NEXT: sshll v0.2d, v2.2s, #0 +; CHECK-SD-NEXT: sshll2 v3.2d, v4.4s, #0 +; CHECK-SD-NEXT: sshll2 v1.2d, v2.4s, #0 +; CHECK-SD-NEXT: sshll v2.2d, v4.2s, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extsubu_v8i16_i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ushll v2.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll v3.4s, v1.4h, #0 +; CHECK-GI-NEXT: ushll2 v4.4s, v0.8h, #0 +; CHECK-GI-NEXT: ushll2 v5.4s, v1.8h, #0 +; CHECK-GI-NEXT: usubl v0.2d, v2.2s, v3.2s +; CHECK-GI-NEXT: usubl2 v1.2d, v2.4s, v3.4s +; 
CHECK-GI-NEXT: usubl v2.2d, v4.2s, v5.2s +; CHECK-GI-NEXT: usubl2 v3.2d, v4.4s, v5.4s +; CHECK-GI-NEXT: ret +entry: + %s0s = zext <8 x i16> %s0 to <8 x i64> + %s1s = zext <8 x i16> %s1 to <8 x i64> + %m = sub <8 x i64> %s0s, %s1s + ret <8 x i64> %m +} + define <2 x i64> @extadds_v2i32_i64(<2 x i32> %s0, <2 x i32> %s1) { ; CHECK-LABEL: extadds_v2i32_i64: ; CHECK: // %bb.0: // %entry @@ -362,12 +1123,19 @@ entry: } define <4 x i64> @extadds_v4i32_i64(<4 x i32> %s0, <4 x i32> %s1) { -; CHECK-LABEL: extadds_v4i32_i64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: saddl2 v2.2d, v0.4s, v1.4s -; CHECK-NEXT: saddl v0.2d, v0.2s, v1.2s -; CHECK-NEXT: mov v1.16b, v2.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extadds_v4i32_i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: saddl2 v2.2d, v0.4s, v1.4s +; CHECK-SD-NEXT: saddl v0.2d, v0.2s, v1.2s +; CHECK-SD-NEXT: mov v1.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extadds_v4i32_i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: saddl v2.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: saddl2 v1.2d, v0.4s, v1.4s +; CHECK-GI-NEXT: mov v0.16b, v2.16b +; CHECK-GI-NEXT: ret entry: %s0s = sext <4 x i32> %s0 to <4 x i64> %s1s = sext <4 x i32> %s1 to <4 x i64> @@ -376,12 +1144,19 @@ entry: } define <4 x i64> @extaddu_v4i32_i64(<4 x i32> %s0, <4 x i32> %s1) { -; CHECK-LABEL: extaddu_v4i32_i64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uaddl2 v2.2d, v0.4s, v1.4s -; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s -; CHECK-NEXT: mov v1.16b, v2.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extaddu_v4i32_i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: uaddl2 v2.2d, v0.4s, v1.4s +; CHECK-SD-NEXT: uaddl v0.2d, v0.2s, v1.2s +; CHECK-SD-NEXT: mov v1.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extaddu_v4i32_i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: uaddl v2.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: uaddl2 v1.2d, v0.4s, v1.4s +; CHECK-GI-NEXT: mov v0.16b, v2.16b +; CHECK-GI-NEXT: ret entry: %s0s = zext <4 x i32> %s0 to <4 x i64> 
%s1s = zext <4 x i32> %s1 to <4 x i64> @@ -390,16 +1165,26 @@ entry: } define <8 x i64> @extadds_v8i32_i64(<8 x i32> %s0, <8 x i32> %s1) { -; CHECK-LABEL: extadds_v8i32_i64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: saddl2 v4.2d, v1.4s, v3.4s -; CHECK-NEXT: saddl v5.2d, v0.2s, v2.2s -; CHECK-NEXT: saddl2 v6.2d, v0.4s, v2.4s -; CHECK-NEXT: saddl v2.2d, v1.2s, v3.2s -; CHECK-NEXT: mov v0.16b, v5.16b -; CHECK-NEXT: mov v1.16b, v6.16b -; CHECK-NEXT: mov v3.16b, v4.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extadds_v8i32_i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: saddl2 v4.2d, v1.4s, v3.4s +; CHECK-SD-NEXT: saddl v5.2d, v0.2s, v2.2s +; CHECK-SD-NEXT: saddl2 v6.2d, v0.4s, v2.4s +; CHECK-SD-NEXT: saddl v2.2d, v1.2s, v3.2s +; CHECK-SD-NEXT: mov v0.16b, v5.16b +; CHECK-SD-NEXT: mov v1.16b, v6.16b +; CHECK-SD-NEXT: mov v3.16b, v4.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extadds_v8i32_i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: saddl v4.2d, v0.2s, v2.2s +; CHECK-GI-NEXT: saddl2 v5.2d, v0.4s, v2.4s +; CHECK-GI-NEXT: saddl v2.2d, v1.2s, v3.2s +; CHECK-GI-NEXT: saddl2 v3.2d, v1.4s, v3.4s +; CHECK-GI-NEXT: mov v0.16b, v4.16b +; CHECK-GI-NEXT: mov v1.16b, v5.16b +; CHECK-GI-NEXT: ret entry: %s0s = sext <8 x i32> %s0 to <8 x i64> %s1s = sext <8 x i32> %s1 to <8 x i64> @@ -408,16 +1193,26 @@ entry: } define <8 x i64> @extaddu_v8i32_i64(<8 x i32> %s0, <8 x i32> %s1) { -; CHECK-LABEL: extaddu_v8i32_i64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uaddl2 v4.2d, v1.4s, v3.4s -; CHECK-NEXT: uaddl v5.2d, v0.2s, v2.2s -; CHECK-NEXT: uaddl2 v6.2d, v0.4s, v2.4s -; CHECK-NEXT: uaddl v2.2d, v1.2s, v3.2s -; CHECK-NEXT: mov v0.16b, v5.16b -; CHECK-NEXT: mov v1.16b, v6.16b -; CHECK-NEXT: mov v3.16b, v4.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extaddu_v8i32_i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: uaddl2 v4.2d, v1.4s, v3.4s +; CHECK-SD-NEXT: uaddl v5.2d, v0.2s, v2.2s +; CHECK-SD-NEXT: uaddl2 v6.2d, v0.4s, v2.4s +; CHECK-SD-NEXT: uaddl v2.2d, 
v1.2s, v3.2s +; CHECK-SD-NEXT: mov v0.16b, v5.16b +; CHECK-SD-NEXT: mov v1.16b, v6.16b +; CHECK-SD-NEXT: mov v3.16b, v4.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extaddu_v8i32_i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: uaddl v4.2d, v0.2s, v2.2s +; CHECK-GI-NEXT: uaddl2 v5.2d, v0.4s, v2.4s +; CHECK-GI-NEXT: uaddl v2.2d, v1.2s, v3.2s +; CHECK-GI-NEXT: uaddl2 v3.2d, v1.4s, v3.4s +; CHECK-GI-NEXT: mov v0.16b, v4.16b +; CHECK-GI-NEXT: mov v1.16b, v5.16b +; CHECK-GI-NEXT: ret entry: %s0s = zext <8 x i32> %s0 to <8 x i64> %s1s = zext <8 x i32> %s1 to <8 x i64> @@ -426,17 +1221,33 @@ entry: } define <16 x i32> @add_zs(<16 x i8> %s0, <16 x i8> %s1) { -; CHECK-LABEL: add_zs: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushll v2.8h, v0.8b, #0 -; CHECK-NEXT: ushll2 v0.8h, v0.16b, #0 -; CHECK-NEXT: saddw v2.8h, v2.8h, v1.8b -; CHECK-NEXT: saddw2 v4.8h, v0.8h, v1.16b -; CHECK-NEXT: sshll v0.4s, v2.4h, #0 -; CHECK-NEXT: sshll2 v3.4s, v4.8h, #0 -; CHECK-NEXT: sshll2 v1.4s, v2.8h, #0 -; CHECK-NEXT: sshll v2.4s, v4.4h, #0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: add_zs: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ushll v2.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll2 v0.8h, v0.16b, #0 +; CHECK-SD-NEXT: saddw v2.8h, v2.8h, v1.8b +; CHECK-SD-NEXT: saddw2 v4.8h, v0.8h, v1.16b +; CHECK-SD-NEXT: sshll v0.4s, v2.4h, #0 +; CHECK-SD-NEXT: sshll2 v3.4s, v4.8h, #0 +; CHECK-SD-NEXT: sshll2 v1.4s, v2.8h, #0 +; CHECK-SD-NEXT: sshll v2.4s, v4.4h, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: add_zs: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sshll v2.8h, v1.8b, #0 +; CHECK-GI-NEXT: sshll2 v1.8h, v1.16b, #0 +; CHECK-GI-NEXT: ushll v3.8h, v0.8b, #0 +; CHECK-GI-NEXT: ushll2 v4.8h, v0.16b, #0 +; CHECK-GI-NEXT: sshll v0.4s, v2.4h, #0 +; CHECK-GI-NEXT: sshll2 v2.4s, v2.8h, #0 +; CHECK-GI-NEXT: sshll v5.4s, v1.4h, #0 +; CHECK-GI-NEXT: sshll2 v6.4s, v1.8h, #0 +; CHECK-GI-NEXT: uaddw v0.4s, v0.4s, v3.4h +; CHECK-GI-NEXT: uaddw2 v1.4s, v2.4s, v3.8h +; CHECK-GI-NEXT: uaddw v2.4s, 
v5.4s, v4.4h +; CHECK-GI-NEXT: uaddw2 v3.4s, v6.4s, v4.8h +; CHECK-GI-NEXT: ret entry: %s0s = zext <16 x i8> %s0 to <16 x i32> %s1s = sext <16 x i8> %s1 to <16 x i32> @@ -445,87 +1256,174 @@ entry: } define <20 x i32> @v20(<20 x i8> %s0, <20 x i8> %s1) { -; CHECK-LABEL: v20: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov s0, w0 -; CHECK-NEXT: ldr b2, [sp, #160] -; CHECK-NEXT: add x10, sp, #168 -; CHECK-NEXT: ldr b3, [sp] -; CHECK-NEXT: add x11, sp, #8 -; CHECK-NEXT: ldr b1, [sp, #96] -; CHECK-NEXT: ld1 { v2.b }[1], [x10] -; CHECK-NEXT: add x9, sp, #104 -; CHECK-NEXT: add x10, sp, #176 -; CHECK-NEXT: mov v0.b[1], w1 -; CHECK-NEXT: ld1 { v3.b }[1], [x11] -; CHECK-NEXT: ld1 { v1.b }[1], [x9] -; CHECK-NEXT: add x11, sp, #16 -; CHECK-NEXT: add x9, sp, #112 -; CHECK-NEXT: add x13, sp, #184 -; CHECK-NEXT: ld1 { v2.b }[2], [x10] -; CHECK-NEXT: add x12, sp, #120 -; CHECK-NEXT: add x14, sp, #32 -; CHECK-NEXT: ld1 { v3.b }[2], [x11] -; CHECK-NEXT: ld1 { v1.b }[2], [x9] -; CHECK-NEXT: ldr b5, [sp, #64] -; CHECK-NEXT: mov v0.b[2], w2 -; CHECK-NEXT: ldr b4, [sp, #224] -; CHECK-NEXT: add x11, sp, #128 -; CHECK-NEXT: ld1 { v2.b }[3], [x13] -; CHECK-NEXT: add x13, sp, #24 -; CHECK-NEXT: add x10, sp, #136 -; CHECK-NEXT: ld1 { v3.b }[3], [x13] -; CHECK-NEXT: ld1 { v1.b }[3], [x12] -; CHECK-NEXT: add x12, sp, #192 -; CHECK-NEXT: add x13, sp, #200 -; CHECK-NEXT: add x15, sp, #80 -; CHECK-NEXT: add x9, sp, #144 -; CHECK-NEXT: mov v0.b[3], w3 -; CHECK-NEXT: ld1 { v2.b }[4], [x12] -; CHECK-NEXT: add x12, sp, #232 -; CHECK-NEXT: ld1 { v3.b }[4], [x14] -; CHECK-NEXT: add x14, sp, #72 -; CHECK-NEXT: ld1 { v4.b }[1], [x12] -; CHECK-NEXT: ld1 { v5.b }[1], [x14] -; CHECK-NEXT: add x14, sp, #40 -; CHECK-NEXT: ld1 { v1.b }[4], [x11] -; CHECK-NEXT: ld1 { v2.b }[5], [x13] -; CHECK-NEXT: add x12, sp, #208 -; CHECK-NEXT: add x13, sp, #48 -; CHECK-NEXT: mov v0.b[4], w4 -; CHECK-NEXT: ld1 { v3.b }[5], [x14] -; CHECK-NEXT: add x14, sp, #240 -; CHECK-NEXT: ld1 { v4.b }[2], [x14] -; CHECK-NEXT: ld1 { 
v5.b }[2], [x15] -; CHECK-NEXT: ld1 { v1.b }[5], [x10] -; CHECK-NEXT: ld1 { v2.b }[6], [x12] -; CHECK-NEXT: add x11, sp, #216 -; CHECK-NEXT: add x10, sp, #56 -; CHECK-NEXT: ld1 { v3.b }[6], [x13] -; CHECK-NEXT: add x12, sp, #248 -; CHECK-NEXT: add x13, sp, #88 -; CHECK-NEXT: mov v0.b[5], w5 -; CHECK-NEXT: ld1 { v4.b }[3], [x12] -; CHECK-NEXT: ld1 { v5.b }[3], [x13] -; CHECK-NEXT: ld1 { v1.b }[6], [x9] -; CHECK-NEXT: ld1 { v2.b }[7], [x11] -; CHECK-NEXT: add x9, sp, #152 -; CHECK-NEXT: ld1 { v3.b }[7], [x10] -; CHECK-NEXT: uaddl v4.8h, v5.8b, v4.8b -; CHECK-NEXT: mov v0.b[6], w6 -; CHECK-NEXT: ld1 { v1.b }[7], [x9] -; CHECK-NEXT: uaddl v2.8h, v3.8b, v2.8b -; CHECK-NEXT: ushll v3.4s, v4.4h, #0 -; CHECK-NEXT: mov v0.b[7], w7 -; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b -; CHECK-NEXT: ushll2 v1.4s, v2.8h, #0 -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: stp q1, q3, [x8, #48] -; CHECK-NEXT: ushll2 v3.4s, v0.8h, #0 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: stp q3, q2, [x8, #16] -; CHECK-NEXT: str q0, [x8] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: v20: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: ldr b2, [sp, #160] +; CHECK-SD-NEXT: add x10, sp, #168 +; CHECK-SD-NEXT: ldr b3, [sp] +; CHECK-SD-NEXT: add x11, sp, #8 +; CHECK-SD-NEXT: ldr b1, [sp, #96] +; CHECK-SD-NEXT: ld1 { v2.b }[1], [x10] +; CHECK-SD-NEXT: add x9, sp, #104 +; CHECK-SD-NEXT: add x10, sp, #176 +; CHECK-SD-NEXT: mov v0.b[1], w1 +; CHECK-SD-NEXT: ld1 { v3.b }[1], [x11] +; CHECK-SD-NEXT: ld1 { v1.b }[1], [x9] +; CHECK-SD-NEXT: add x11, sp, #16 +; CHECK-SD-NEXT: add x9, sp, #112 +; CHECK-SD-NEXT: add x13, sp, #184 +; CHECK-SD-NEXT: ld1 { v2.b }[2], [x10] +; CHECK-SD-NEXT: add x12, sp, #120 +; CHECK-SD-NEXT: add x14, sp, #32 +; CHECK-SD-NEXT: ld1 { v3.b }[2], [x11] +; CHECK-SD-NEXT: ld1 { v1.b }[2], [x9] +; CHECK-SD-NEXT: ldr b5, [sp, #64] +; CHECK-SD-NEXT: mov v0.b[2], w2 +; CHECK-SD-NEXT: ldr b4, [sp, #224] +; CHECK-SD-NEXT: add x11, sp, #128 +; CHECK-SD-NEXT: ld1 
{ v2.b }[3], [x13] +; CHECK-SD-NEXT: add x13, sp, #24 +; CHECK-SD-NEXT: add x10, sp, #136 +; CHECK-SD-NEXT: ld1 { v3.b }[3], [x13] +; CHECK-SD-NEXT: ld1 { v1.b }[3], [x12] +; CHECK-SD-NEXT: add x12, sp, #192 +; CHECK-SD-NEXT: add x13, sp, #200 +; CHECK-SD-NEXT: add x15, sp, #80 +; CHECK-SD-NEXT: add x9, sp, #144 +; CHECK-SD-NEXT: mov v0.b[3], w3 +; CHECK-SD-NEXT: ld1 { v2.b }[4], [x12] +; CHECK-SD-NEXT: add x12, sp, #232 +; CHECK-SD-NEXT: ld1 { v3.b }[4], [x14] +; CHECK-SD-NEXT: add x14, sp, #72 +; CHECK-SD-NEXT: ld1 { v4.b }[1], [x12] +; CHECK-SD-NEXT: ld1 { v5.b }[1], [x14] +; CHECK-SD-NEXT: add x14, sp, #40 +; CHECK-SD-NEXT: ld1 { v1.b }[4], [x11] +; CHECK-SD-NEXT: ld1 { v2.b }[5], [x13] +; CHECK-SD-NEXT: add x12, sp, #208 +; CHECK-SD-NEXT: add x13, sp, #48 +; CHECK-SD-NEXT: mov v0.b[4], w4 +; CHECK-SD-NEXT: ld1 { v3.b }[5], [x14] +; CHECK-SD-NEXT: add x14, sp, #240 +; CHECK-SD-NEXT: ld1 { v4.b }[2], [x14] +; CHECK-SD-NEXT: ld1 { v5.b }[2], [x15] +; CHECK-SD-NEXT: ld1 { v1.b }[5], [x10] +; CHECK-SD-NEXT: ld1 { v2.b }[6], [x12] +; CHECK-SD-NEXT: add x11, sp, #216 +; CHECK-SD-NEXT: add x10, sp, #56 +; CHECK-SD-NEXT: ld1 { v3.b }[6], [x13] +; CHECK-SD-NEXT: add x12, sp, #248 +; CHECK-SD-NEXT: add x13, sp, #88 +; CHECK-SD-NEXT: mov v0.b[5], w5 +; CHECK-SD-NEXT: ld1 { v4.b }[3], [x12] +; CHECK-SD-NEXT: ld1 { v5.b }[3], [x13] +; CHECK-SD-NEXT: ld1 { v1.b }[6], [x9] +; CHECK-SD-NEXT: ld1 { v2.b }[7], [x11] +; CHECK-SD-NEXT: add x9, sp, #152 +; CHECK-SD-NEXT: ld1 { v3.b }[7], [x10] +; CHECK-SD-NEXT: uaddl v4.8h, v5.8b, v4.8b +; CHECK-SD-NEXT: mov v0.b[6], w6 +; CHECK-SD-NEXT: ld1 { v1.b }[7], [x9] +; CHECK-SD-NEXT: uaddl v2.8h, v3.8b, v2.8b +; CHECK-SD-NEXT: ushll v3.4s, v4.4h, #0 +; CHECK-SD-NEXT: mov v0.b[7], w7 +; CHECK-SD-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: ushll2 v1.4s, v2.8h, #0 +; CHECK-SD-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-SD-NEXT: stp q1, q3, [x8, #48] +; CHECK-SD-NEXT: ushll2 v3.4s, v0.8h, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; 
CHECK-SD-NEXT: stp q3, q2, [x8, #16] +; CHECK-SD-NEXT: str q0, [x8] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v20: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr s0, [sp] +; CHECK-GI-NEXT: ldr s4, [sp, #8] +; CHECK-GI-NEXT: fmov s1, w0 +; CHECK-GI-NEXT: ldr s18, [sp, #16] +; CHECK-GI-NEXT: ldr s2, [sp, #32] +; CHECK-GI-NEXT: fmov s3, w4 +; CHECK-GI-NEXT: mov v0.s[1], v4.s[0] +; CHECK-GI-NEXT: ldr s16, [sp, #40] +; CHECK-GI-NEXT: ldr s4, [sp, #64] +; CHECK-GI-NEXT: ldr s19, [sp, #72] +; CHECK-GI-NEXT: ldr s21, [sp, #104] +; CHECK-GI-NEXT: mov v1.s[1], w1 +; CHECK-GI-NEXT: mov v2.s[1], v16.s[0] +; CHECK-GI-NEXT: ldr s16, [sp, #96] +; CHECK-GI-NEXT: ldr s22, [sp, #136] +; CHECK-GI-NEXT: mov v3.s[1], w5 +; CHECK-GI-NEXT: ldr s20, [sp, #48] +; CHECK-GI-NEXT: mov v4.s[1], v19.s[0] +; CHECK-GI-NEXT: mov v0.s[2], v18.s[0] +; CHECK-GI-NEXT: ldr s18, [sp, #128] +; CHECK-GI-NEXT: ldr s19, [sp, #160] +; CHECK-GI-NEXT: ldr s24, [sp, #168] +; CHECK-GI-NEXT: mov v16.s[1], v21.s[0] +; CHECK-GI-NEXT: ldr s21, [sp, #192] +; CHECK-GI-NEXT: mov v18.s[1], v22.s[0] +; CHECK-GI-NEXT: ldr s25, [sp, #200] +; CHECK-GI-NEXT: ldr s22, [sp, #224] +; CHECK-GI-NEXT: ldr s26, [sp, #232] +; CHECK-GI-NEXT: ldr s23, [sp, #112] +; CHECK-GI-NEXT: mov v19.s[1], v24.s[0] +; CHECK-GI-NEXT: mov v2.s[2], v20.s[0] +; CHECK-GI-NEXT: ldr s20, [sp, #144] +; CHECK-GI-NEXT: ldr s17, [sp, #80] +; CHECK-GI-NEXT: mov v21.s[1], v25.s[0] +; CHECK-GI-NEXT: mov v22.s[1], v26.s[0] +; CHECK-GI-NEXT: mov v1.s[2], w2 +; CHECK-GI-NEXT: mov v3.s[2], w6 +; CHECK-GI-NEXT: ldr s24, [sp, #176] +; CHECK-GI-NEXT: mov v16.s[2], v23.s[0] +; CHECK-GI-NEXT: mov v18.s[2], v20.s[0] +; CHECK-GI-NEXT: mov v4.s[2], v17.s[0] +; CHECK-GI-NEXT: ldr s17, [sp, #208] +; CHECK-GI-NEXT: ldr s23, [sp, #240] +; CHECK-GI-NEXT: ldr s20, [sp, #120] +; CHECK-GI-NEXT: mov v19.s[2], v24.s[0] +; CHECK-GI-NEXT: ldr s24, [sp, #152] +; CHECK-GI-NEXT: ldr s5, [sp, #24] +; CHECK-GI-NEXT: mov v21.s[2], v17.s[0] +; CHECK-GI-NEXT: mov v22.s[2], v23.s[0] 
+; CHECK-GI-NEXT: mov v1.s[3], w3 +; CHECK-GI-NEXT: mov v16.s[3], v20.s[0] +; CHECK-GI-NEXT: movi v17.2d, #0x0000ff000000ff +; CHECK-GI-NEXT: mov v3.s[3], w7 +; CHECK-GI-NEXT: mov v18.s[3], v24.s[0] +; CHECK-GI-NEXT: ldr s6, [sp, #56] +; CHECK-GI-NEXT: ldr s7, [sp, #88] +; CHECK-GI-NEXT: ldr s25, [sp, #184] +; CHECK-GI-NEXT: ldr s20, [sp, #216] +; CHECK-GI-NEXT: mov v0.s[3], v5.s[0] +; CHECK-GI-NEXT: ldr s5, [sp, #248] +; CHECK-GI-NEXT: mov v19.s[3], v25.s[0] +; CHECK-GI-NEXT: mov v2.s[3], v6.s[0] +; CHECK-GI-NEXT: mov v4.s[3], v7.s[0] +; CHECK-GI-NEXT: mov v21.s[3], v20.s[0] +; CHECK-GI-NEXT: mov v22.s[3], v5.s[0] +; CHECK-GI-NEXT: and v1.16b, v1.16b, v17.16b +; CHECK-GI-NEXT: and v5.16b, v16.16b, v17.16b +; CHECK-GI-NEXT: and v3.16b, v3.16b, v17.16b +; CHECK-GI-NEXT: and v6.16b, v18.16b, v17.16b +; CHECK-GI-NEXT: and v0.16b, v0.16b, v17.16b +; CHECK-GI-NEXT: and v7.16b, v19.16b, v17.16b +; CHECK-GI-NEXT: and v2.16b, v2.16b, v17.16b +; CHECK-GI-NEXT: and v4.16b, v4.16b, v17.16b +; CHECK-GI-NEXT: and v16.16b, v21.16b, v17.16b +; CHECK-GI-NEXT: add v1.4s, v1.4s, v5.4s +; CHECK-GI-NEXT: and v5.16b, v22.16b, v17.16b +; CHECK-GI-NEXT: add v3.4s, v3.4s, v6.4s +; CHECK-GI-NEXT: add v0.4s, v0.4s, v7.4s +; CHECK-GI-NEXT: add v2.4s, v2.4s, v16.4s +; CHECK-GI-NEXT: stp q1, q3, [x8] +; CHECK-GI-NEXT: add v1.4s, v4.4s, v5.4s +; CHECK-GI-NEXT: stp q0, q2, [x8, #32] +; CHECK-GI-NEXT: str q1, [x8, #64] +; CHECK-GI-NEXT: ret entry: %s0s = zext <20 x i8> %s0 to <20 x i32> %s1s = zext <20 x i8> %s1 to <20 x i32> @@ -534,98 +1432,165 @@ entry: } define <16 x i32> @i12(<16 x i12> %s0, <16 x i12> %s1) { -; CHECK-LABEL: i12: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x23, [sp, #-48]! 
// 8-byte Folded Spill -; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w21, -24 -; CHECK-NEXT: .cfi_offset w22, -32 -; CHECK-NEXT: .cfi_offset w23, -48 -; CHECK-NEXT: ldr w13, [sp, #112] -; CHECK-NEXT: ldr w14, [sp, #144] -; CHECK-NEXT: fmov s2, w4 -; CHECK-NEXT: ldr w17, [sp, #176] -; CHECK-NEXT: ldr w19, [sp, #208] -; CHECK-NEXT: fmov s3, w0 -; CHECK-NEXT: ldr w20, [sp, #80] -; CHECK-NEXT: ldr w21, [sp, #48] -; CHECK-NEXT: fmov s5, w13 -; CHECK-NEXT: fmov s4, w19 -; CHECK-NEXT: fmov s6, w17 -; CHECK-NEXT: fmov s7, w14 -; CHECK-NEXT: fmov s0, w20 -; CHECK-NEXT: fmov s1, w21 -; CHECK-NEXT: ldr w10, [sp, #120] -; CHECK-NEXT: ldr w11, [sp, #152] -; CHECK-NEXT: ldr w12, [sp, #184] -; CHECK-NEXT: ldr w15, [sp, #216] -; CHECK-NEXT: ldr w22, [sp, #88] -; CHECK-NEXT: ldr w23, [sp, #56] -; CHECK-NEXT: mov v2.h[1], w5 -; CHECK-NEXT: mov v3.h[1], w1 -; CHECK-NEXT: mov v5.h[1], w10 -; CHECK-NEXT: mov v4.h[1], w15 -; CHECK-NEXT: mov v0.h[1], w22 -; CHECK-NEXT: mov v1.h[1], w23 -; CHECK-NEXT: mov v6.h[1], w12 -; CHECK-NEXT: mov v7.h[1], w11 -; CHECK-NEXT: ldr w8, [sp, #128] -; CHECK-NEXT: ldr w9, [sp, #160] -; CHECK-NEXT: ldr w16, [sp, #64] -; CHECK-NEXT: ldr w18, [sp, #96] -; CHECK-NEXT: ldr w10, [sp, #192] -; CHECK-NEXT: ldr w11, [sp, #224] -; CHECK-NEXT: mov v2.h[2], w6 -; CHECK-NEXT: mov v3.h[2], w2 -; CHECK-NEXT: mov v0.h[2], w18 -; CHECK-NEXT: mov v1.h[2], w16 -; CHECK-NEXT: mov v5.h[2], w8 -; CHECK-NEXT: mov v4.h[2], w11 -; CHECK-NEXT: mov v6.h[2], w10 -; CHECK-NEXT: mov v7.h[2], w9 -; CHECK-NEXT: ldr w12, [sp, #72] -; CHECK-NEXT: ldr w13, [sp, #104] -; CHECK-NEXT: ldr w8, [sp, #136] -; CHECK-NEXT: ldr w9, [sp, #168] -; CHECK-NEXT: ldr w10, [sp, #200] -; CHECK-NEXT: ldr w11, [sp, #232] -; CHECK-NEXT: mov v0.h[3], w13 -; CHECK-NEXT: mov v1.h[3], w12 -; 
CHECK-NEXT: mov v2.h[3], w7 -; CHECK-NEXT: mov v3.h[3], w3 -; CHECK-NEXT: mov v5.h[3], w8 -; CHECK-NEXT: mov v4.h[3], w11 -; CHECK-NEXT: mov v6.h[3], w10 -; CHECK-NEXT: mov v7.h[3], w9 -; CHECK-NEXT: movi v16.4s, #15, msl #8 -; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-NEXT: ushll v5.4s, v5.4h, #0 -; CHECK-NEXT: ushll v4.4s, v4.4h, #0 -; CHECK-NEXT: ushll v6.4s, v6.4h, #0 -; CHECK-NEXT: ushll v7.4s, v7.4h, #0 -; CHECK-NEXT: and v17.16b, v0.16b, v16.16b -; CHECK-NEXT: and v18.16b, v1.16b, v16.16b -; CHECK-NEXT: and v1.16b, v2.16b, v16.16b -; CHECK-NEXT: and v0.16b, v3.16b, v16.16b -; CHECK-NEXT: and v2.16b, v5.16b, v16.16b -; CHECK-NEXT: and v3.16b, v4.16b, v16.16b -; CHECK-NEXT: and v4.16b, v6.16b, v16.16b -; CHECK-NEXT: and v5.16b, v7.16b, v16.16b -; CHECK-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s -; CHECK-NEXT: add v3.4s, v17.4s, v3.4s -; CHECK-NEXT: add v1.4s, v1.4s, v5.4s -; CHECK-NEXT: add v2.4s, v18.4s, v4.4s -; CHECK-NEXT: ldr x23, [sp], #48 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-SD-LABEL: i12: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x23, [sp, #-48]! 
// 8-byte Folded Spill +; CHECK-SD-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w21, -24 +; CHECK-SD-NEXT: .cfi_offset w22, -32 +; CHECK-SD-NEXT: .cfi_offset w23, -48 +; CHECK-SD-NEXT: ldr w13, [sp, #112] +; CHECK-SD-NEXT: ldr w14, [sp, #144] +; CHECK-SD-NEXT: fmov s2, w4 +; CHECK-SD-NEXT: ldr w17, [sp, #176] +; CHECK-SD-NEXT: ldr w19, [sp, #208] +; CHECK-SD-NEXT: fmov s3, w0 +; CHECK-SD-NEXT: ldr w20, [sp, #80] +; CHECK-SD-NEXT: ldr w21, [sp, #48] +; CHECK-SD-NEXT: fmov s5, w13 +; CHECK-SD-NEXT: fmov s4, w19 +; CHECK-SD-NEXT: fmov s6, w17 +; CHECK-SD-NEXT: fmov s7, w14 +; CHECK-SD-NEXT: fmov s0, w20 +; CHECK-SD-NEXT: fmov s1, w21 +; CHECK-SD-NEXT: ldr w10, [sp, #120] +; CHECK-SD-NEXT: ldr w11, [sp, #152] +; CHECK-SD-NEXT: ldr w12, [sp, #184] +; CHECK-SD-NEXT: ldr w15, [sp, #216] +; CHECK-SD-NEXT: ldr w22, [sp, #88] +; CHECK-SD-NEXT: ldr w23, [sp, #56] +; CHECK-SD-NEXT: mov v2.h[1], w5 +; CHECK-SD-NEXT: mov v3.h[1], w1 +; CHECK-SD-NEXT: mov v5.h[1], w10 +; CHECK-SD-NEXT: mov v4.h[1], w15 +; CHECK-SD-NEXT: mov v0.h[1], w22 +; CHECK-SD-NEXT: mov v1.h[1], w23 +; CHECK-SD-NEXT: mov v6.h[1], w12 +; CHECK-SD-NEXT: mov v7.h[1], w11 +; CHECK-SD-NEXT: ldr w8, [sp, #128] +; CHECK-SD-NEXT: ldr w9, [sp, #160] +; CHECK-SD-NEXT: ldr w16, [sp, #64] +; CHECK-SD-NEXT: ldr w18, [sp, #96] +; CHECK-SD-NEXT: ldr w10, [sp, #192] +; CHECK-SD-NEXT: ldr w11, [sp, #224] +; CHECK-SD-NEXT: mov v2.h[2], w6 +; CHECK-SD-NEXT: mov v3.h[2], w2 +; CHECK-SD-NEXT: mov v0.h[2], w18 +; CHECK-SD-NEXT: mov v1.h[2], w16 +; CHECK-SD-NEXT: mov v5.h[2], w8 +; CHECK-SD-NEXT: mov v4.h[2], w11 +; CHECK-SD-NEXT: mov v6.h[2], w10 +; CHECK-SD-NEXT: mov v7.h[2], w9 +; CHECK-SD-NEXT: ldr w12, [sp, #72] +; CHECK-SD-NEXT: ldr w13, [sp, #104] +; CHECK-SD-NEXT: ldr w8, [sp, #136] +; CHECK-SD-NEXT: ldr 
w9, [sp, #168] +; CHECK-SD-NEXT: ldr w10, [sp, #200] +; CHECK-SD-NEXT: ldr w11, [sp, #232] +; CHECK-SD-NEXT: mov v0.h[3], w13 +; CHECK-SD-NEXT: mov v1.h[3], w12 +; CHECK-SD-NEXT: mov v2.h[3], w7 +; CHECK-SD-NEXT: mov v3.h[3], w3 +; CHECK-SD-NEXT: mov v5.h[3], w8 +; CHECK-SD-NEXT: mov v4.h[3], w11 +; CHECK-SD-NEXT: mov v6.h[3], w10 +; CHECK-SD-NEXT: mov v7.h[3], w9 +; CHECK-SD-NEXT: movi v16.4s, #15, msl #8 +; CHECK-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-SD-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-SD-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-SD-NEXT: ushll v5.4s, v5.4h, #0 +; CHECK-SD-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-SD-NEXT: ushll v6.4s, v6.4h, #0 +; CHECK-SD-NEXT: ushll v7.4s, v7.4h, #0 +; CHECK-SD-NEXT: and v17.16b, v0.16b, v16.16b +; CHECK-SD-NEXT: and v18.16b, v1.16b, v16.16b +; CHECK-SD-NEXT: and v1.16b, v2.16b, v16.16b +; CHECK-SD-NEXT: and v0.16b, v3.16b, v16.16b +; CHECK-SD-NEXT: and v2.16b, v5.16b, v16.16b +; CHECK-SD-NEXT: and v3.16b, v4.16b, v16.16b +; CHECK-SD-NEXT: and v4.16b, v6.16b, v16.16b +; CHECK-SD-NEXT: and v5.16b, v7.16b, v16.16b +; CHECK-SD-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-SD-NEXT: add v3.4s, v17.4s, v3.4s +; CHECK-SD-NEXT: add v1.4s, v1.4s, v5.4s +; CHECK-SD-NEXT: add v2.4s, v18.4s, v4.4s +; CHECK-SD-NEXT: ldr x23, [sp], #48 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: i12: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fmov s1, w0 +; CHECK-GI-NEXT: fmov s4, w4 +; CHECK-GI-NEXT: ldr s0, [sp] +; CHECK-GI-NEXT: ldr s20, [sp, #8] +; CHECK-GI-NEXT: ldr s2, [sp, #32] +; CHECK-GI-NEXT: ldr s21, [sp, #40] +; CHECK-GI-NEXT: ldr s16, [sp, #64] +; CHECK-GI-NEXT: ldr s22, [sp, #72] +; CHECK-GI-NEXT: ldr s17, [sp, #96] +; CHECK-GI-NEXT: ldr s23, [sp, #104] +; CHECK-GI-NEXT: mov v1.s[1], w1 +; CHECK-GI-NEXT: mov v4.s[1], w5 +; CHECK-GI-NEXT: ldr s18, 
[sp, #128] +; CHECK-GI-NEXT: ldr s24, [sp, #136] +; CHECK-GI-NEXT: mov v0.s[1], v20.s[0] +; CHECK-GI-NEXT: ldr s19, [sp, #160] +; CHECK-GI-NEXT: ldr s25, [sp, #168] +; CHECK-GI-NEXT: mov v2.s[1], v21.s[0] +; CHECK-GI-NEXT: mov v16.s[1], v22.s[0] +; CHECK-GI-NEXT: mov v17.s[1], v23.s[0] +; CHECK-GI-NEXT: mov v18.s[1], v24.s[0] +; CHECK-GI-NEXT: mov v19.s[1], v25.s[0] +; CHECK-GI-NEXT: ldr s6, [sp, #16] +; CHECK-GI-NEXT: ldr s7, [sp, #48] +; CHECK-GI-NEXT: ldr s20, [sp, #80] +; CHECK-GI-NEXT: ldr s21, [sp, #112] +; CHECK-GI-NEXT: ldr s22, [sp, #144] +; CHECK-GI-NEXT: ldr s23, [sp, #176] +; CHECK-GI-NEXT: mov v1.s[2], w2 +; CHECK-GI-NEXT: mov v4.s[2], w6 +; CHECK-GI-NEXT: mov v0.s[2], v6.s[0] +; CHECK-GI-NEXT: mov v2.s[2], v7.s[0] +; CHECK-GI-NEXT: mov v16.s[2], v20.s[0] +; CHECK-GI-NEXT: mov v17.s[2], v21.s[0] +; CHECK-GI-NEXT: mov v18.s[2], v22.s[0] +; CHECK-GI-NEXT: mov v19.s[2], v23.s[0] +; CHECK-GI-NEXT: ldr s3, [sp, #24] +; CHECK-GI-NEXT: ldr s5, [sp, #56] +; CHECK-GI-NEXT: ldr s6, [sp, #88] +; CHECK-GI-NEXT: ldr s7, [sp, #120] +; CHECK-GI-NEXT: ldr s20, [sp, #152] +; CHECK-GI-NEXT: ldr s21, [sp, #184] +; CHECK-GI-NEXT: mov v1.s[3], w3 +; CHECK-GI-NEXT: mov v4.s[3], w7 +; CHECK-GI-NEXT: movi v22.4s, #15, msl #8 +; CHECK-GI-NEXT: mov v0.s[3], v3.s[0] +; CHECK-GI-NEXT: mov v2.s[3], v5.s[0] +; CHECK-GI-NEXT: mov v16.s[3], v6.s[0] +; CHECK-GI-NEXT: mov v17.s[3], v7.s[0] +; CHECK-GI-NEXT: mov v18.s[3], v20.s[0] +; CHECK-GI-NEXT: mov v19.s[3], v21.s[0] +; CHECK-GI-NEXT: and v1.16b, v1.16b, v22.16b +; CHECK-GI-NEXT: and v3.16b, v4.16b, v22.16b +; CHECK-GI-NEXT: and v4.16b, v0.16b, v22.16b +; CHECK-GI-NEXT: and v5.16b, v2.16b, v22.16b +; CHECK-GI-NEXT: and v0.16b, v16.16b, v22.16b +; CHECK-GI-NEXT: and v2.16b, v17.16b, v22.16b +; CHECK-GI-NEXT: and v6.16b, v18.16b, v22.16b +; CHECK-GI-NEXT: and v7.16b, v19.16b, v22.16b +; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: add v1.4s, v3.4s, v2.4s +; CHECK-GI-NEXT: add v2.4s, v4.4s, v6.4s +; CHECK-GI-NEXT: add 
v3.4s, v5.4s, v7.4s +; CHECK-GI-NEXT: ret entry: %s0s = zext <16 x i12> %s0 to <16 x i32> %s1s = zext <16 x i12> %s1 to <16 x i32> @@ -634,15 +1599,27 @@ entry: } define <16 x i32> @sub_zz(<16 x i8> %s0, <16 x i8> %s1) { -; CHECK-LABEL: sub_zz: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: usubl v2.8h, v0.8b, v1.8b -; CHECK-NEXT: usubl2 v4.8h, v0.16b, v1.16b -; CHECK-NEXT: sshll v0.4s, v2.4h, #0 -; CHECK-NEXT: sshll2 v3.4s, v4.8h, #0 -; CHECK-NEXT: sshll2 v1.4s, v2.8h, #0 -; CHECK-NEXT: sshll v2.4s, v4.4h, #0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sub_zz: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: usubl v2.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: usubl2 v4.8h, v0.16b, v1.16b +; CHECK-SD-NEXT: sshll v0.4s, v2.4h, #0 +; CHECK-SD-NEXT: sshll2 v3.4s, v4.8h, #0 +; CHECK-SD-NEXT: sshll2 v1.4s, v2.8h, #0 +; CHECK-SD-NEXT: sshll v2.4s, v4.4h, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sub_zz: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ushll v2.8h, v0.8b, #0 +; CHECK-GI-NEXT: ushll v3.8h, v1.8b, #0 +; CHECK-GI-NEXT: ushll2 v4.8h, v0.16b, #0 +; CHECK-GI-NEXT: ushll2 v5.8h, v1.16b, #0 +; CHECK-GI-NEXT: usubl v0.4s, v2.4h, v3.4h +; CHECK-GI-NEXT: usubl2 v1.4s, v2.8h, v3.8h +; CHECK-GI-NEXT: usubl v2.4s, v4.4h, v5.4h +; CHECK-GI-NEXT: usubl2 v3.4s, v4.8h, v5.8h +; CHECK-GI-NEXT: ret entry: %s0s = zext <16 x i8> %s0 to <16 x i32> %s1s = zext <16 x i8> %s1 to <16 x i32> @@ -651,15 +1628,27 @@ entry: } define <16 x i32> @sub_ss(<16 x i8> %s0, <16 x i8> %s1) { -; CHECK-LABEL: sub_ss: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ssubl v2.8h, v0.8b, v1.8b -; CHECK-NEXT: ssubl2 v4.8h, v0.16b, v1.16b -; CHECK-NEXT: sshll v0.4s, v2.4h, #0 -; CHECK-NEXT: sshll2 v3.4s, v4.8h, #0 -; CHECK-NEXT: sshll2 v1.4s, v2.8h, #0 -; CHECK-NEXT: sshll v2.4s, v4.4h, #0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sub_ss: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ssubl v2.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: ssubl2 v4.8h, v0.16b, v1.16b +; CHECK-SD-NEXT: sshll v0.4s, v2.4h, #0 +; 
CHECK-SD-NEXT: sshll2 v3.4s, v4.8h, #0 +; CHECK-SD-NEXT: sshll2 v1.4s, v2.8h, #0 +; CHECK-SD-NEXT: sshll v2.4s, v4.4h, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sub_ss: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0 +; CHECK-GI-NEXT: sshll v3.8h, v1.8b, #0 +; CHECK-GI-NEXT: sshll2 v4.8h, v0.16b, #0 +; CHECK-GI-NEXT: sshll2 v5.8h, v1.16b, #0 +; CHECK-GI-NEXT: ssubl v0.4s, v2.4h, v3.4h +; CHECK-GI-NEXT: ssubl2 v1.4s, v2.8h, v3.8h +; CHECK-GI-NEXT: ssubl v2.4s, v4.4h, v5.4h +; CHECK-GI-NEXT: ssubl2 v3.4s, v4.8h, v5.8h +; CHECK-GI-NEXT: ret entry: %s0s = sext <16 x i8> %s0 to <16 x i32> %s1s = sext <16 x i8> %s1 to <16 x i32> @@ -668,17 +1657,33 @@ entry: } define <16 x i32> @sub_zs(<16 x i8> %s0, <16 x i8> %s1) { -; CHECK-LABEL: sub_zs: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushll v2.8h, v0.8b, #0 -; CHECK-NEXT: ushll2 v0.8h, v0.16b, #0 -; CHECK-NEXT: ssubw v2.8h, v2.8h, v1.8b -; CHECK-NEXT: ssubw2 v4.8h, v0.8h, v1.16b -; CHECK-NEXT: sshll v0.4s, v2.4h, #0 -; CHECK-NEXT: sshll2 v3.4s, v4.8h, #0 -; CHECK-NEXT: sshll2 v1.4s, v2.8h, #0 -; CHECK-NEXT: sshll v2.4s, v4.4h, #0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sub_zs: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ushll v2.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll2 v0.8h, v0.16b, #0 +; CHECK-SD-NEXT: ssubw v2.8h, v2.8h, v1.8b +; CHECK-SD-NEXT: ssubw2 v4.8h, v0.8h, v1.16b +; CHECK-SD-NEXT: sshll v0.4s, v2.4h, #0 +; CHECK-SD-NEXT: sshll2 v3.4s, v4.8h, #0 +; CHECK-SD-NEXT: sshll2 v1.4s, v2.8h, #0 +; CHECK-SD-NEXT: sshll v2.4s, v4.4h, #0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sub_zs: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ushll v2.8h, v0.8b, #0 +; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0 +; CHECK-GI-NEXT: sshll v3.8h, v1.8b, #0 +; CHECK-GI-NEXT: sshll2 v4.8h, v1.16b, #0 +; CHECK-GI-NEXT: ushll v1.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll2 v2.4s, v2.8h, #0 +; CHECK-GI-NEXT: ushll v5.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll2 v6.4s, v0.8h, #0 +; CHECK-GI-NEXT: ssubw v0.4s, v1.4s, 
v3.4h +; CHECK-GI-NEXT: ssubw2 v1.4s, v2.4s, v3.8h +; CHECK-GI-NEXT: ssubw v2.4s, v5.4s, v4.4h +; CHECK-GI-NEXT: ssubw2 v3.4s, v6.4s, v4.8h +; CHECK-GI-NEXT: ret entry: %s0s = zext <16 x i8> %s0 to <16 x i32> %s1s = sext <16 x i8> %s1 to <16 x i32> From 753ac4786e250604224701616f0962e41e163a02 Mon Sep 17 00:00:00 2001 From: Sergei Barannikov Date: Wed, 29 May 2024 16:54:14 +0300 Subject: [PATCH 128/230] [RISCV][test] Add missing check-prefix to a test (NFC) (#93683) --- .../RISCV/GlobalISel/legalizer/legalize-abs-rv32.mir | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv32.mir index 7d05edd3f34132..f96d6597821788 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv32.mir @@ -1,8 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=riscv32 -run-pass=legalizer %s -o - \ -# RUN: | FileCheck %s --check-prefix=RV32I +# RUN: | FileCheck %s --check-prefixes=CHECK,RV32I # RUN: llc -mtriple=riscv32 -mattr=+zbb -run-pass=legalizer %s -o -\ -# RUN: | FileCheck %s --check-prefix=RV32ZBB +# RUN: | FileCheck %s --check-prefixes=CHECK,RV32ZBB --- name: abs_i8 @@ -124,10 +124,12 @@ body: | ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[COPY1]], [[C1]](s32) ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[ASHR]] ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD]](s32), [[ASHR]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY1]], [[ASHR1]] ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[ICMP]] - ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[ADD]], [[ASHR]] - ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[ADD2]], [[ASHR1]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY 
[[ADD2]](s32) + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY2]], [[ASHR]] + ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[COPY3]], [[ASHR1]] ; CHECK-NEXT: $x10 = COPY [[XOR]](s32) ; CHECK-NEXT: $x11 = COPY [[XOR1]](s32) ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11 From df9701bfee2b13282a9c1bf981d37b965cb22bf7 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Wed, 29 May 2024 08:57:16 -0500 Subject: [PATCH 129/230] [OpenMP] Fix multiply installing `libomp.so` (#93685) Summary: The `add_llvm_library` interface handles installing the llvm libraries, however we want to do our own handling. Otherwise, this will install into the `./lib` location instead of the `./lib/<target>` one. --- openmp/runtime/src/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/openmp/runtime/src/CMakeLists.txt b/openmp/runtime/src/CMakeLists.txt index 612d784be8a55c..62c35c19e6b456 100644 --- a/openmp/runtime/src/CMakeLists.txt +++ b/openmp/runtime/src/CMakeLists.txt @@ -177,6 +177,7 @@ else() add_llvm_library(omp ${LIBOMP_LIBRARY_KIND} ${LIBOMP_SOURCE_FILES} PARTIAL_SOURCES_INTENDED LINK_LIBS ${LIBOMP_CONFIGURED_LIBFLAGS} ${LIBOMP_DL_LIBS} LINK_COMPONENTS Support + BUILDTREE_ONLY ) # libomp must be a C++ library such that it can link libLLVMSupport set(LIBOMP_LINKER_LANGUAGE CXX) From 7af5b68a03bb7f5090a96b3f9f9a34f0e196e466 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 29 May 2024 16:00:41 +0200 Subject: [PATCH 130/230] [DFSan] Directly create gep inbounds for arg origin tls (NFCI) Calling code explicitly checks that ArgNo is inbounds. NFCI because constant expression creation already infers it, this just makes it explicit.
--- llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index 20d11e0ab55f2b..f0b0917a25938c 100644 --- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -1804,8 +1804,8 @@ Value *DFSanFunction::getRetvalTLS(Type *T, IRBuilder<> &IRB) { Value *DFSanFunction::getRetvalOriginTLS() { return DFS.RetvalOriginTLS; } Value *DFSanFunction::getArgOriginTLS(unsigned ArgNo, IRBuilder<> &IRB) { - return IRB.CreateConstGEP2_64(DFS.ArgOriginTLSTy, DFS.ArgOriginTLS, 0, ArgNo, - "_dfsarg_o"); + return IRB.CreateConstInBoundsGEP2_64(DFS.ArgOriginTLSTy, DFS.ArgOriginTLS, 0, + ArgNo, "_dfsarg_o"); } Value *DFSanFunction::getOrigin(Value *V) { From fbe98da623c014a3e935b1e683aecdacee17f5bd Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 29 May 2024 14:57:22 +0100 Subject: [PATCH 131/230] [AMDGPU] Fix filecheck annotation typos Co-authored-by: klensy --- .../AMDGPU/irreducible/diverged-entry-headers.ll | 2 +- llvm/test/CodeGen/AMDGPU/addrspacecast.ll | 4 ++-- llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir | 10 +++++----- llvm/test/MC/AMDGPU/hsa-diag-v4.s | 2 +- llvm/test/MC/Disassembler/AMDGPU/gfx10-wave32.txt | 12 ++++++------ llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt | 2 +- llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll | 2 +- 7 files changed, 17 insertions(+), 17 deletions(-) diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers.ll index 335026dc9b62bd..efad77b684a75a 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers.ll @@ -90,7 +90,7 @@ S: br i1 
%cond.uni, label %exit, label %T T: -; CHECK-NIT: DIVERGENT: %tt.phi = phi i32 +; CHECK-NOT: DIVERGENT: %tt.phi = phi i32 %tt.phi = phi i32 [ %ss, %S ], [ %a, %entry ] %tt = add i32 %b, 1 br label %P diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll index 50423c59eabe94..526d5c946ec7f6 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -108,7 +108,7 @@ define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %pt } ; no-op -; HSA-LABEl: {{^}}use_constant_to_flat_addrspacecast: +; HSA-LABEL: {{^}}use_constant_to_flat_addrspacecast: ; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]] ; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]] ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]] @@ -119,7 +119,7 @@ define amdgpu_kernel void @use_constant_to_flat_addrspacecast(ptr addrspace(4) % ret void } -; HSA-LABEl: {{^}}use_constant_to_global_addrspacecast: +; HSA-LABEL: {{^}}use_constant_to_global_addrspacecast: ; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]] ; CI-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]] ; CI-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]] diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir index 29621a0477418d..1151bde02ef62c 100644 --- a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir +++ b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir @@ -4,7 +4,7 @@ --- -# GCN-label: name: vop3 +# GCN-LABEL: name: vop3 # GCN: %6:vgpr_32, %7:sreg_32_xm0_xexec = V_SUBBREV_U32_e64_dpp %3, %0, %1, %5, 1, 1, 15, 15, 1, implicit $exec # GCN: %8:vgpr_32 = V_CVT_PK_U8_F32_e64_dpp %3, 4, %0, 2, %2, 2, %1, 1, 1, 15, 15, 1, implicit $mode, implicit $exec # GCN: %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %0, 0, 12345678, 0, 0, implicit $mode, implicit $exec @@ -37,7 +37,7 @@ body: | ... 
--- -# GCN-label: name: vop3_sgpr_src1 +# GCN-LABEL: name: vop3_sgpr_src1 # GCN: %6:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %1, 0, %2, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec # GFX1100: %8:vgpr_32 = V_MED3_F32_e64 0, %7, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec # GFX1150: %8:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %2, 0, %1, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec @@ -81,7 +81,7 @@ body: | --- # Regression test for src_modifiers on base u16 opcode -# GCN-label: name: vop3_u16 +# GCN-LABEL: name: vop3_u16 # GCN: %5:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 0, %1, 0, %3, 0, 0, 1, 15, 15, 1, implicit $exec # GCN: %7:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 1, %5, 2, %5, 0, 0, 1, 15, 15, 1, implicit $exec # GCN: %9:vgpr_32 = V_ADD_NC_U16_e64 4, %8, 8, %7, 0, 0, implicit $exec @@ -205,7 +205,7 @@ body: | ... # do not combine, dpp arg used twice -# GCN-label: name: dpp_arg_twice +# GCN-LABEL: name: dpp_arg_twice # GCN: %4:vgpr_32 = V_FMA_F32_e64 1, %1, 2, %3, 2, %3, 1, 2, implicit $mode, implicit $exec # GCN: %6:vgpr_32 = V_FMA_F32_e64 2, %5, 2, %1, 2, %5, 1, 2, implicit $mode, implicit $exec # GCN: %8:vgpr_32 = V_FMA_F32_e64 2, %7, 2, %7, 2, %1, 1, 2, implicit $mode, implicit $exec @@ -231,7 +231,7 @@ body: | ... 
# when the dpp source isn't a src0 operand the operation should be commuted if possible -# GCN-label: name: dpp_commute_e64 +# GCN-LABEL: name: dpp_commute_e64 # GCN: %4:vgpr_32 = V_MUL_U32_U24_e64_dpp %1, %0, %1, 1, 1, 14, 15, 0, implicit $exec # GCN: %7:vgpr_32 = V_FMA_F32_e64_dpp %5, 2, %0, 1, %1, 2, %1, 1, 2, 1, 15, 15, 1, implicit $mode, implicit $exec # GCN: %10:vgpr_32 = V_SUBREV_U32_e64_dpp %1, %0, %1, 1, 1, 14, 15, 0, implicit $exec diff --git a/llvm/test/MC/AMDGPU/hsa-diag-v4.s b/llvm/test/MC/AMDGPU/hsa-diag-v4.s index 069b71b7229cdd..cc10d3400e9b1a 100644 --- a/llvm/test/MC/AMDGPU/hsa-diag-v4.s +++ b/llvm/test/MC/AMDGPU/hsa-diag-v4.s @@ -54,7 +54,7 @@ // GCN-LABEL: warning: test_amdhsa_group_segment_fixed_size_repeated // AMDHSA: error: .amdhsa_ directives cannot be repeated -// NONAMDHSA-: error: unknown directive +// NONAMDHSA: error: unknown directive .warning "test_amdhsa_group_segment_fixed_size_repeated" .amdhsa_kernel test_amdhsa_group_segment_fixed_size_repeated .amdhsa_group_segment_fixed_size 1 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10-wave32.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10-wave32.txt index 7d15f041bd770e..78ca1bbdacf295 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10-wave32.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10-wave32.txt @@ -91,20 +91,20 @@ # FIXME: Results in invalid v_subrev_u16_dpp which apparently has the same encoding but does not exist in GFX10 -# gfx1032: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 -# gfx1064: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 +# COM: GFX1032: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 +# COM: GFX1064: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 # 0xfa,0x04,0x0a,0x50,0x01,0xe4,0x00,0x00 # FIXME: Results in v_mul_lo_u16_dpp -# gfx1032: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo 
quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 -# gfx1064: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 +# COM: GFX1032: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 +# COM: GFX1064: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 # 0xfa,0x04,0x0a,0x52,0x01,0xe4,0x00,0x00 # FIXME: gives v_lshlrev_b16_dpp -# gfx1032: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 -# gfx1064: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 +# COM: GFX1032: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 +# COM: GFX1064: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 # 0xfa,0x04,0x0a,0x54,0x01,0xe4,0x00,0x00 # GFX1032: v_add_co_u32 v0, s0, v0, v2 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt index 36c58d4c673263..473ede00603a78 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt @@ -1674,7 +1674,7 @@ # GFX12: ds_pk_add_f16 v0, v0 offset:4660 ; encoding: [0x34,0x12,0x68,0xda,0x00,0x00,0x00,0x00] 0x34,0x12,0x68,0xda,0x00,0x00,0x00,0x00 -# gfx12: ds_pk_add_bf16 v2, v1 ; encoding: [0x00,0x00,0x6c,0xda,0x02,0x01,0x00,0x00] +# GFX12: ds_pk_add_bf16 v2, v1 ; encoding: [0x00,0x00,0x6c,0xda,0x02,0x01,0x00,0x00] 0x00,0x00,0x6c,0xda,0x02,0x01,0x00,0x00 # GFX12: ds_pk_add_f16 v0, v0 offset:4660 ; encoding: [0x34,0x12,0x68,0xda,0x00,0x00,0x00,0x00] diff --git a/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll b/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll index bb370a6d1dfeb0..7f7790cecb0eb8 100644 --- a/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll +++ b/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll @@ -670,7 +670,7 @@ declare void 
@llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1) define amdgpu_cs void @test_buffer_atomic_fadd(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %offset, i1 %slc) { ; CHECK: immarg operand has non-immediate parameter ; CHECK-NEXT: i1 %slc - ; CHECK-ENXT: call void @llvm.amdgcn.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %offset, i1 %slc) + ; CHECK-NEXT: call void @llvm.amdgcn.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %offset, i1 %slc) call void @llvm.amdgcn.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %offset, i1 %slc) ret void } From e8e5ba00db1b6a8ed5c988b1a252c86487d1bce7 Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Wed, 29 May 2024 15:06:41 +0100 Subject: [PATCH 132/230] [AArch64][TargetParser] Move ExtensionDependencies into tablegen [NFC] (#93614) This patch generates ExtensionDependency pairs {Earlier, Later} inferred by the 'Implies' field of every Extension defined in tablegen. Implied Subtarget Features that are not Extensions are skipped. --- .../llvm/TargetParser/AArch64TargetParser.h | 51 +------------------ llvm/lib/TargetParser/AArch64TargetParser.cpp | 6 --- llvm/utils/TableGen/ARMTargetDefEmitter.cpp | 18 +++++++ 3 files changed, 20 insertions(+), 55 deletions(-) diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index b3fff3c99025a5..5025ab2491de8f 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -183,55 +183,8 @@ struct ExtensionDependency { ArchExtKind Later; }; -// clang-format off -// Each entry here is a link in the dependency chain starting from the -// extension that was added to the architecture first. 
-inline constexpr ExtensionDependency ExtensionDependencies[] = { - {AEK_FP, AEK_FP16}, - {AEK_FP, AEK_SIMD}, - {AEK_FP, AEK_JSCVT}, - {AEK_FP, AEK_FP8}, - {AEK_SIMD, AEK_CRYPTO}, - {AEK_SIMD, AEK_AES}, - {AEK_SIMD, AEK_SHA2}, - {AEK_SIMD, AEK_SHA3}, - {AEK_SIMD, AEK_SM4}, - {AEK_SIMD, AEK_RDM}, - {AEK_SIMD, AEK_DOTPROD}, - {AEK_SIMD, AEK_FCMA}, - {AEK_FP16, AEK_FP16FML}, - {AEK_FP16, AEK_SVE}, - {AEK_BF16, AEK_SME}, - {AEK_BF16, AEK_B16B16}, - {AEK_SVE, AEK_SVE2}, - {AEK_SVE, AEK_F32MM}, - {AEK_SVE, AEK_F64MM}, - {AEK_SVE2, AEK_SVE2P1}, - {AEK_SVE2, AEK_SVE2BITPERM}, - {AEK_SVE2, AEK_SVE2AES}, - {AEK_SVE2, AEK_SVE2SHA3}, - {AEK_SVE2, AEK_SVE2SM4}, - {AEK_SVE2, AEK_SMEFA64}, - {AEK_SVE2, AEK_SMEFA64}, - {AEK_SME, AEK_SME2}, - {AEK_SME, AEK_SMEF16F16}, - {AEK_SME, AEK_SMEF64F64}, - {AEK_SME, AEK_SMEI16I64}, - {AEK_SME, AEK_SMEFA64}, - {AEK_SME2, AEK_SME2P1}, - {AEK_SME2, AEK_SSVE_FP8FMA}, - {AEK_SME2, AEK_SSVE_FP8DOT2}, - {AEK_SME2, AEK_SSVE_FP8DOT4}, - {AEK_SME2, AEK_SMEF8F16}, - {AEK_SME2, AEK_SMEF8F32}, - {AEK_FP8, AEK_SMEF8F16}, - {AEK_FP8, AEK_SMEF8F32}, - {AEK_LSE, AEK_LSE128}, - {AEK_PREDRES, AEK_SPECRES2}, - {AEK_RAS, AEK_RASV2}, - {AEK_RCPC, AEK_RCPC3}, -}; -// clang-format on +#define EMIT_EXTENSION_DEPENDENCIES +#include "llvm/TargetParser/AArch64TargetParserDef.inc" enum ArchProfile { AProfile = 'A', RProfile = 'R', InvalidProfile = '?' }; diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp index c10b4be4eded99..ca356ec82bf1f9 100644 --- a/llvm/lib/TargetParser/AArch64TargetParser.cpp +++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp @@ -181,12 +181,6 @@ void AArch64::ExtensionSet::enable(ArchExtKind E) { !BaseArch->is_superset(ARMV9A)) enable(AEK_FP16FML); - // For all architectures, +crypto enables +aes and +sha2. - if (E == AEK_CRYPTO) { - enable(AEK_AES); - enable(AEK_SHA2); - } - // For v8.4A+ and v9.0A+, +crypto also enables +sha3 and +sm4. 
if (E == AEK_CRYPTO && BaseArch->is_superset(ARMV8_4A)) { enable(AEK_SHA3); diff --git a/llvm/utils/TableGen/ARMTargetDefEmitter.cpp b/llvm/utils/TableGen/ARMTargetDefEmitter.cpp index b79458529623f3..5efa7d2722d3f9 100644 --- a/llvm/utils/TableGen/ARMTargetDefEmitter.cpp +++ b/llvm/utils/TableGen/ARMTargetDefEmitter.cpp @@ -116,6 +116,24 @@ static void EmitARMTargetDef(RecordKeeper &RK, raw_ostream &OS) { << "#endif // EMIT_EXTENSIONS\n" << "\n"; + // Emit extension dependencies + OS << "#ifdef EMIT_EXTENSION_DEPENDENCIES\n" + << "inline constexpr ExtensionDependency ExtensionDependencies[] = {\n"; + for (const Record *Rec : SortedExtensions) { + auto LaterAEK = Rec->getValueAsString("ArchExtKindSpelling").upper(); + for (const Record *I : Rec->getValueAsListOfDefs("Implies")) + if (auto EarlierAEK = I->getValueAsOptionalString("ArchExtKindSpelling")) + OS << " {" << EarlierAEK->upper() << ", " << LaterAEK << "},\n"; + } + // FIXME: Tablegen has the Subtarget Feature FeatureRCPC_IMMO which is implied + // by FeatureRCPC3 and in turn implies FeatureRCPC. The proper fix is to make + // FeatureRCPC_IMMO an Extension but that will expose it to the command line. + OS << " {AEK_RCPC, AEK_RCPC3},\n"; + OS << "};\n" + << "#undef EMIT_EXTENSION_DEPENDENCIES\n" + << "#endif // EMIT_EXTENSION_DEPENDENCIES\n" + << "\n"; + // Emit architecture information OS << "#ifdef EMIT_ARCHITECTURES\n"; From e20f0fe29f714a22679214b499744735d528fc1a Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 29 May 2024 16:12:32 +0200 Subject: [PATCH 133/230] [WasmEHPrepare] Explicitly create inbounds GEP (NFCI) These are known to be inbounds, create them as such. NFCI because constant expression construction currently already infers this. Also drop the unnecessary zero-index GEP: This is equivalent to the pointer itself nowadays. 
--- llvm/lib/CodeGen/WasmEHPrepare.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/llvm/lib/CodeGen/WasmEHPrepare.cpp b/llvm/lib/CodeGen/WasmEHPrepare.cpp index 1a9e1ba869c310..16c1dcb1e11753 100644 --- a/llvm/lib/CodeGen/WasmEHPrepare.cpp +++ b/llvm/lib/CodeGen/WasmEHPrepare.cpp @@ -252,12 +252,11 @@ bool WasmEHPrepareImpl::prepareEHPads(Function &F) { M.getOrInsertGlobal("__wasm_lpad_context", LPadContextTy)); LPadContextGV->setThreadLocalMode(GlobalValue::GeneralDynamicTLSModel); - LPadIndexField = IRB.CreateConstGEP2_32(LPadContextTy, LPadContextGV, 0, 0, - "lpad_index_gep"); - LSDAField = - IRB.CreateConstGEP2_32(LPadContextTy, LPadContextGV, 0, 1, "lsda_gep"); - SelectorField = IRB.CreateConstGEP2_32(LPadContextTy, LPadContextGV, 0, 2, - "selector_gep"); + LPadIndexField = LPadContextGV; + LSDAField = IRB.CreateConstInBoundsGEP2_32(LPadContextTy, LPadContextGV, 0, 1, + "lsda_gep"); + SelectorField = IRB.CreateConstInBoundsGEP2_32(LPadContextTy, LPadContextGV, + 0, 2, "selector_gep"); // wasm.landingpad.index() intrinsic, which is to specify landingpad index LPadIndexF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_landingpad_index); From 14dc97df5ef3a9178fc4175303f0f86ed4e3f98e Mon Sep 17 00:00:00 2001 From: chuongg3 Date: Wed, 29 May 2024 15:15:53 +0100 Subject: [PATCH 134/230] [AArch64][GlobalISel] Push ADD/SUB through Extend Instructions (#90964) The regression in one test is due to a SUB instruction being pushed through the extend, leaving behind the abs instruction, which prevents it from selecting uabdl instructions shown below: `i32 abs(i32 sub(i32 ext i8, i32 ext i8))` => `i32 abs(i32 ext(i16 sub(i16 ext i8, i16 ext i8)))` This is intended to be fixed in a follow up patch --- llvm/lib/Target/AArch64/AArch64Combine.td | 19 +- .../GISel/AArch64PreLegalizerCombiner.cpp | 51 ++ .../AArch64/GlobalISel/combine-add.mir | 36 +- llvm/test/CodeGen/AArch64/aarch64-addv.ll | 25 +- llvm/test/CodeGen/AArch64/arm64-vabs.ll | 82 +-- 
llvm/test/CodeGen/AArch64/neon-extadd.ll | 622 ++++++++---------- llvm/test/CodeGen/AArch64/vecreduce-add.ll | 142 ++-- 7 files changed, 494 insertions(+), 483 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index 1ce6cdf1c1e1ed..3f717c8a60050f 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -52,6 +52,19 @@ def ext_uaddv_to_uaddlv : GICombineRule< (apply [{ applyExtUaddvToUaddlv(*${root}, MRI, B, Observer, ${matchinfo}); }]) >; +class push_opcode_through_ext<Instruction opcode, Instruction extOpcode> : GICombineRule < + (defs root:$root), + (match (extOpcode $ext1, $src1):$ExtMI, + (extOpcode $ext2, $src2), + (opcode $dst, $ext1, $ext2):$root, + [{ return matchPushAddSubExt(*${root}, MRI, ${dst}.getReg(), ${src1}.getReg(), ${src2}.getReg()); }]), + (apply [{ applyPushAddSubExt(*${root}, MRI, B, ${ExtMI}->getOpcode() == TargetOpcode::G_SEXT, ${dst}.getReg(), ${src1}.getReg(), ${src2}.getReg()); }])>; + +def push_sub_through_zext : push_opcode_through_ext<G_SUB, G_ZEXT>; +def push_add_through_zext : push_opcode_through_ext<G_ADD, G_ZEXT>; +def push_sub_through_sext : push_opcode_through_ext<G_SUB, G_SEXT>; +def push_add_through_sext : push_opcode_through_ext<G_ADD, G_SEXT>; + def AArch64PreLegalizerCombiner: GICombiner< "AArch64PreLegalizerCombinerImpl", [all_combines, fconstant_to_constant, @@ -59,7 +72,11 @@ def AArch64PreLegalizerCombiner: GICombiner< fold_global_offset, shuffle_to_extract, ext_addv_to_udot_addv, - ext_uaddv_to_uaddlv]> { + ext_uaddv_to_uaddlv, + push_sub_through_zext, + push_add_through_zext, + push_sub_through_sext, + push_add_through_sext]> { let CombineAllMethodName = "tryCombineAllImpl"; } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp index a82d3cd095659b..0f89fa557cd57e 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp @@ -554,6 +554,57 @@
void applyExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI, MI.eraseFromParent(); } +// Pushes ADD/SUB through extend instructions to decrease the number of extend +// instructions at the end by allowing selection of {s|u}addl sooner + +// i32 add(i32 ext i8, i32 ext i8) => i32 ext(i16 add(i16 ext i8, i16 ext i8)) +bool matchPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI, + Register DstReg, Register SrcReg1, Register SrcReg2) { + assert((MI.getOpcode() == TargetOpcode::G_ADD || + MI.getOpcode() == TargetOpcode::G_SUB) && + "Expected a G_ADD or G_SUB instruction\n"); + + // Deal with vector types only + LLT DstTy = MRI.getType(DstReg); + if (!DstTy.isVector()) + return false; + + // Match only when the G_{S|Z}EXT result is more than 2x the source width + Register ExtDstReg = MI.getOperand(1).getReg(); + LLT Ext1SrcTy = MRI.getType(SrcReg1); + LLT Ext2SrcTy = MRI.getType(SrcReg2); + unsigned ExtDstScal = MRI.getType(ExtDstReg).getScalarSizeInBits(); + unsigned Ext1SrcScal = Ext1SrcTy.getScalarSizeInBits(); + if (((Ext1SrcScal == 8 && ExtDstScal == 32) || + ((Ext1SrcScal == 8 || Ext1SrcScal == 16) && ExtDstScal == 64)) && + Ext1SrcTy == Ext2SrcTy) + return true; + + return false; +} + +void applyPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, bool isSExt, Register DstReg, + Register SrcReg1, Register SrcReg2) { + LLT SrcTy = MRI.getType(SrcReg1); + LLT MidTy = SrcTy.changeElementSize(SrcTy.getScalarSizeInBits() * 2); + unsigned Opc = isSExt ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT; + Register Ext1Reg = B.buildInstr(Opc, {MidTy}, {SrcReg1}).getReg(0); + Register Ext2Reg = B.buildInstr(Opc, {MidTy}, {SrcReg2}).getReg(0); + Register AddReg = + B.buildInstr(MI.getOpcode(), {MidTy}, {Ext1Reg, Ext2Reg}).getReg(0); + + // G_SUB has to sign-extend the result. + // G_ADD needs to sext from sext and can sext or zext from zext, so the + // original opcode is used.
+ if (MI.getOpcode() == TargetOpcode::G_ADD) + B.buildInstr(Opc, {DstReg}, {AddReg}); + else + B.buildSExt(DstReg, AddReg); + + MI.eraseFromParent(); +} + bool tryToSimplifyUADDO(MachineInstr &MI, MachineIRBuilder &B, CombinerHelper &Helper, GISelChangeObserver &Observer) { // Try simplify G_UADDO with 8 or 16 bit operands to wide G_ADD and TBNZ if diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-add.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-add.mir index 78411f34bebd31..a0142afd067770 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-add.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-add.mir @@ -219,10 +219,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s8>) = COPY $d1 - ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[COPY]](<8 x s8>) - ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[COPY1]](<8 x s8>) - ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s32>) = G_ADD [[SEXT]], [[SEXT1]] - ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[ADD]](<8 x s32>) + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<8 x s16>) = G_SEXT [[COPY]](<8 x s8>) + ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(<8 x s16>) = G_SEXT [[COPY1]](<8 x s8>) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[SEXT]], [[SEXT1]] + ; CHECK-NEXT: [[SEXT2:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[ADD]](<8 x s16>) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[SEXT2]](<8 x s32>) ; CHECK-NEXT: $q0 = COPY [[UV]](<4 x s32>) ; CHECK-NEXT: $q1 = COPY [[UV1]](<4 x s32>) ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1 @@ -249,10 +250,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s8>) = COPY $d1 - ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<8 x s32>) = G_ZEXT [[COPY]](<8 x s8>) - ; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(<8 x s32>) = G_ZEXT [[COPY1]](<8 x s8>) - ; 
CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s32>) = G_ADD [[ZEXT]], [[ZEXT1]] - ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[ADD]](<8 x s32>) + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<8 x s16>) = G_ZEXT [[COPY]](<8 x s8>) + ; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(<8 x s16>) = G_ZEXT [[COPY1]](<8 x s8>) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[ZEXT]], [[ZEXT1]] + ; CHECK-NEXT: [[ZEXT2:%[0-9]+]]:_(<8 x s32>) = G_ZEXT [[ADD]](<8 x s16>) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[ZEXT2]](<8 x s32>) ; CHECK-NEXT: $q0 = COPY [[UV]](<4 x s32>) ; CHECK-NEXT: $q1 = COPY [[UV1]](<4 x s32>) ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1 @@ -279,10 +281,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s8>) = COPY $d1 - ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[COPY]](<8 x s8>) - ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[COPY1]](<8 x s8>) - ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s32>) = G_SUB [[SEXT]], [[SEXT1]] - ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[SUB]](<8 x s32>) + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<8 x s16>) = G_SEXT [[COPY]](<8 x s8>) + ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(<8 x s16>) = G_SEXT [[COPY1]](<8 x s8>) + ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[SEXT]], [[SEXT1]] + ; CHECK-NEXT: [[SEXT2:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[SUB]](<8 x s16>) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[SEXT2]](<8 x s32>) ; CHECK-NEXT: $q0 = COPY [[UV]](<4 x s32>) ; CHECK-NEXT: $q1 = COPY [[UV1]](<4 x s32>) ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1 @@ -309,10 +312,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s8>) = COPY $d1 - ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<8 x s32>) = G_ZEXT 
[[COPY]](<8 x s8>) - ; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(<8 x s32>) = G_ZEXT [[COPY1]](<8 x s8>) - ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s32>) = G_SUB [[ZEXT]], [[ZEXT1]] - ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[SUB]](<8 x s32>) + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<8 x s16>) = G_ZEXT [[COPY]](<8 x s8>) + ; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(<8 x s16>) = G_ZEXT [[COPY1]](<8 x s8>) + ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[ZEXT]], [[ZEXT1]] + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[SUB]](<8 x s16>) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[SEXT]](<8 x s32>) ; CHECK-NEXT: $q0 = COPY [[UV]](<4 x s32>) ; CHECK-NEXT: $q1 = COPY [[UV1]](<4 x s32>) ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1 diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll index 94b792b887eb47..def4192b0e005d 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll @@ -94,18 +94,19 @@ define i32 @oversized_ADDV_256(ptr noalias nocapture readonly %arg1, ptr noalias ; ; GISEL-LABEL: oversized_ADDV_256: ; GISEL: // %bb.0: // %entry -; GISEL-NEXT: ldr d0, [x0] -; GISEL-NEXT: ldr d1, [x1] -; GISEL-NEXT: ushll v0.8h, v0.8b, #0 -; GISEL-NEXT: ushll v1.8h, v1.8b, #0 -; GISEL-NEXT: usubl v2.4s, v0.4h, v1.4h -; GISEL-NEXT: usubl2 v0.4s, v0.8h, v1.8h -; GISEL-NEXT: cmlt v1.4s, v2.4s, #0 -; GISEL-NEXT: cmlt v3.4s, v0.4s, #0 -; GISEL-NEXT: neg v4.4s, v2.4s -; GISEL-NEXT: neg v5.4s, v0.4s -; GISEL-NEXT: bsl v1.16b, v4.16b, v2.16b -; GISEL-NEXT: bit v0.16b, v5.16b, v3.16b +; GISEL-NEXT: ldr d1, [x0] +; GISEL-NEXT: ldr d2, [x1] +; GISEL-NEXT: movi v0.2d, #0000000000000000 +; GISEL-NEXT: usubl v1.8h, v1.8b, v2.8b +; GISEL-NEXT: sshll v2.4s, v1.4h, #0 +; GISEL-NEXT: sshll2 v3.4s, v1.8h, #0 +; GISEL-NEXT: ssubw2 v0.4s, v0.4s, v1.8h +; GISEL-NEXT: cmlt v4.4s, v2.4s, #0 +; GISEL-NEXT: cmlt v5.4s, 
v3.4s, #0 +; GISEL-NEXT: neg v6.4s, v2.4s +; GISEL-NEXT: mov v1.16b, v4.16b +; GISEL-NEXT: bif v0.16b, v3.16b, v5.16b +; GISEL-NEXT: bsl v1.16b, v6.16b, v2.16b ; GISEL-NEXT: add v0.4s, v1.4s, v0.4s ; GISEL-NEXT: addv s0, v0.4s ; GISEL-NEXT: fmov w0, s0 diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll index f7d31a214563bc..178c229d04e471 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll @@ -289,26 +289,27 @@ define i32 @uabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) { ; ; CHECK-GI-LABEL: uabd16b_rdx_i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ushll.8h v2, v0, #0 -; CHECK-GI-NEXT: ushll.8h v3, v1, #0 -; CHECK-GI-NEXT: ushll2.8h v0, v0, #0 -; CHECK-GI-NEXT: ushll2.8h v1, v1, #0 -; CHECK-GI-NEXT: usubl.4s v4, v2, v3 -; CHECK-GI-NEXT: usubl2.4s v2, v2, v3 -; CHECK-GI-NEXT: usubl.4s v3, v0, v1 -; CHECK-GI-NEXT: usubl2.4s v0, v0, v1 -; CHECK-GI-NEXT: cmlt.4s v1, v4, #0 -; CHECK-GI-NEXT: cmlt.4s v5, v2, #0 -; CHECK-GI-NEXT: neg.4s v16, v4 -; CHECK-GI-NEXT: cmlt.4s v6, v3, #0 -; CHECK-GI-NEXT: cmlt.4s v7, v0, #0 -; CHECK-GI-NEXT: neg.4s v17, v2 -; CHECK-GI-NEXT: neg.4s v18, v3 -; CHECK-GI-NEXT: neg.4s v19, v0 -; CHECK-GI-NEXT: bsl.16b v1, v16, v4 -; CHECK-GI-NEXT: bit.16b v2, v17, v5 -; CHECK-GI-NEXT: bit.16b v3, v18, v6 -; CHECK-GI-NEXT: bit.16b v0, v19, v7 +; CHECK-GI-NEXT: usubl.8h v3, v0, v1 +; CHECK-GI-NEXT: movi.2d v2, #0000000000000000 +; CHECK-GI-NEXT: usubl2.8h v0, v0, v1 +; CHECK-GI-NEXT: sshll.4s v1, v3, #0 +; CHECK-GI-NEXT: sshll2.4s v4, v3, #0 +; CHECK-GI-NEXT: sshll.4s v5, v0, #0 +; CHECK-GI-NEXT: sshll2.4s v6, v0, #0 +; CHECK-GI-NEXT: ssubw2.4s v3, v2, v3 +; CHECK-GI-NEXT: ssubw2.4s v0, v2, v0 +; CHECK-GI-NEXT: cmlt.4s v2, v1, #0 +; CHECK-GI-NEXT: cmlt.4s v7, v4, #0 +; CHECK-GI-NEXT: neg.4s v16, v1 +; CHECK-GI-NEXT: cmlt.4s v17, v5, #0 +; CHECK-GI-NEXT: cmlt.4s v18, v6, #0 +; CHECK-GI-NEXT: neg.4s v19, v5 +; CHECK-GI-NEXT: bit.16b v1, v16, v2 +; CHECK-GI-NEXT: mov.16b 
v2, v7 +; CHECK-GI-NEXT: bif.16b v0, v6, v18 +; CHECK-GI-NEXT: bsl.16b v2, v3, v4 +; CHECK-GI-NEXT: mov.16b v3, v17 +; CHECK-GI-NEXT: bsl.16b v3, v19, v5 ; CHECK-GI-NEXT: add.4s v1, v1, v2 ; CHECK-GI-NEXT: add.4s v0, v3, v0 ; CHECK-GI-NEXT: add.4s v0, v1, v0 @@ -336,26 +337,27 @@ define i32 @sabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) { ; ; CHECK-GI-LABEL: sabd16b_rdx_i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: sshll.8h v2, v0, #0 -; CHECK-GI-NEXT: sshll.8h v3, v1, #0 -; CHECK-GI-NEXT: sshll2.8h v0, v0, #0 -; CHECK-GI-NEXT: sshll2.8h v1, v1, #0 -; CHECK-GI-NEXT: ssubl.4s v4, v2, v3 -; CHECK-GI-NEXT: ssubl2.4s v2, v2, v3 -; CHECK-GI-NEXT: ssubl.4s v3, v0, v1 -; CHECK-GI-NEXT: ssubl2.4s v0, v0, v1 -; CHECK-GI-NEXT: cmlt.4s v1, v4, #0 -; CHECK-GI-NEXT: cmlt.4s v5, v2, #0 -; CHECK-GI-NEXT: neg.4s v16, v4 -; CHECK-GI-NEXT: cmlt.4s v6, v3, #0 -; CHECK-GI-NEXT: cmlt.4s v7, v0, #0 -; CHECK-GI-NEXT: neg.4s v17, v2 -; CHECK-GI-NEXT: neg.4s v18, v3 -; CHECK-GI-NEXT: neg.4s v19, v0 -; CHECK-GI-NEXT: bsl.16b v1, v16, v4 -; CHECK-GI-NEXT: bit.16b v2, v17, v5 -; CHECK-GI-NEXT: bit.16b v3, v18, v6 -; CHECK-GI-NEXT: bit.16b v0, v19, v7 +; CHECK-GI-NEXT: ssubl.8h v3, v0, v1 +; CHECK-GI-NEXT: movi.2d v2, #0000000000000000 +; CHECK-GI-NEXT: ssubl2.8h v0, v0, v1 +; CHECK-GI-NEXT: sshll.4s v1, v3, #0 +; CHECK-GI-NEXT: sshll2.4s v4, v3, #0 +; CHECK-GI-NEXT: sshll.4s v5, v0, #0 +; CHECK-GI-NEXT: sshll2.4s v6, v0, #0 +; CHECK-GI-NEXT: ssubw2.4s v3, v2, v3 +; CHECK-GI-NEXT: ssubw2.4s v0, v2, v0 +; CHECK-GI-NEXT: cmlt.4s v2, v1, #0 +; CHECK-GI-NEXT: cmlt.4s v7, v4, #0 +; CHECK-GI-NEXT: neg.4s v16, v1 +; CHECK-GI-NEXT: cmlt.4s v17, v5, #0 +; CHECK-GI-NEXT: cmlt.4s v18, v6, #0 +; CHECK-GI-NEXT: neg.4s v19, v5 +; CHECK-GI-NEXT: bit.16b v1, v16, v2 +; CHECK-GI-NEXT: mov.16b v2, v7 +; CHECK-GI-NEXT: bif.16b v0, v6, v18 +; CHECK-GI-NEXT: bsl.16b v2, v3, v4 +; CHECK-GI-NEXT: mov.16b v3, v17 +; CHECK-GI-NEXT: bsl.16b v3, v19, v5 ; CHECK-GI-NEXT: add.4s v1, v1, v2 ; CHECK-GI-NEXT: add.4s v0, v3, v0 
; CHECK-GI-NEXT: add.4s v0, v1, v0 diff --git a/llvm/test/CodeGen/AArch64/neon-extadd.ll b/llvm/test/CodeGen/AArch64/neon-extadd.ll index 6aa9c394a8fd1f..402682c89124bd 100644 --- a/llvm/test/CodeGen/AArch64/neon-extadd.ll +++ b/llvm/test/CodeGen/AArch64/neon-extadd.ll @@ -134,10 +134,9 @@ define <8 x i32> @extadds_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1) { ; ; CHECK-GI-LABEL: extadds_v8i8_i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0 -; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-GI-NEXT: saddl v0.4s, v2.4h, v1.4h -; CHECK-GI-NEXT: saddl2 v1.4s, v2.8h, v1.8h +; CHECK-GI-NEXT: saddl v1.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: sshll v0.4s, v1.4h, #0 +; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0 ; CHECK-GI-NEXT: ret entry: %s0s = sext <8 x i8> %s0 to <8 x i32> @@ -156,10 +155,9 @@ define <8 x i32> @extaddu_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1) { ; ; CHECK-GI-LABEL: extaddu_v8i8_i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ushll v2.8h, v0.8b, #0 -; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-GI-NEXT: uaddl v0.4s, v2.4h, v1.4h -; CHECK-GI-NEXT: uaddl2 v1.4s, v2.8h, v1.8h +; CHECK-GI-NEXT: uaddl v1.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: ushll v0.4s, v1.4h, #0 +; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0 ; CHECK-GI-NEXT: ret entry: %s0s = zext <8 x i8> %s0 to <8 x i32> @@ -178,10 +176,9 @@ define <8 x i32> @extsubs_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1) { ; ; CHECK-GI-LABEL: extsubs_v8i8_i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0 -; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-GI-NEXT: ssubl v0.4s, v2.4h, v1.4h -; CHECK-GI-NEXT: ssubl2 v1.4s, v2.8h, v1.8h +; CHECK-GI-NEXT: ssubl v1.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: sshll v0.4s, v1.4h, #0 +; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0 ; CHECK-GI-NEXT: ret entry: %s0s = sext <8 x i8> %s0 to <8 x i32> @@ -200,10 +197,9 @@ define <8 x i32> @extsubu_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1) { ; ; CHECK-GI-LABEL: extsubu_v8i8_i32: ; CHECK-GI: // %bb.0: // %entry -; 
CHECK-GI-NEXT: ushll v2.8h, v0.8b, #0 -; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-GI-NEXT: usubl v0.4s, v2.4h, v1.4h -; CHECK-GI-NEXT: usubl2 v1.4s, v2.8h, v1.8h +; CHECK-GI-NEXT: usubl v1.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: sshll v0.4s, v1.4h, #0 +; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0 ; CHECK-GI-NEXT: ret entry: %s0s = zext <8 x i8> %s0 to <8 x i32> @@ -225,14 +221,12 @@ define <16 x i32> @extadds_v16i8_i32(<16 x i8> %s0, <16 x i8> %s1) { ; ; CHECK-GI-LABEL: extadds_v16i8_i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0 -; CHECK-GI-NEXT: sshll v3.8h, v1.8b, #0 -; CHECK-GI-NEXT: sshll2 v4.8h, v0.16b, #0 -; CHECK-GI-NEXT: sshll2 v5.8h, v1.16b, #0 -; CHECK-GI-NEXT: saddl v0.4s, v2.4h, v3.4h -; CHECK-GI-NEXT: saddl2 v1.4s, v2.8h, v3.8h -; CHECK-GI-NEXT: saddl v2.4s, v4.4h, v5.4h -; CHECK-GI-NEXT: saddl2 v3.4s, v4.8h, v5.8h +; CHECK-GI-NEXT: saddl v2.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: saddl2 v3.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: sshll v0.4s, v2.4h, #0 +; CHECK-GI-NEXT: sshll2 v1.4s, v2.8h, #0 +; CHECK-GI-NEXT: sshll v2.4s, v3.4h, #0 +; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, #0 ; CHECK-GI-NEXT: ret entry: %s0s = sext <16 x i8> %s0 to <16 x i32> @@ -254,14 +248,12 @@ define <16 x i32> @extaddu_v16i8_i32(<16 x i8> %s0, <16 x i8> %s1) { ; ; CHECK-GI-LABEL: extaddu_v16i8_i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ushll v2.8h, v0.8b, #0 -; CHECK-GI-NEXT: ushll v3.8h, v1.8b, #0 -; CHECK-GI-NEXT: ushll2 v4.8h, v0.16b, #0 -; CHECK-GI-NEXT: ushll2 v5.8h, v1.16b, #0 -; CHECK-GI-NEXT: uaddl v0.4s, v2.4h, v3.4h -; CHECK-GI-NEXT: uaddl2 v1.4s, v2.8h, v3.8h -; CHECK-GI-NEXT: uaddl v2.4s, v4.4h, v5.4h -; CHECK-GI-NEXT: uaddl2 v3.4s, v4.8h, v5.8h +; CHECK-GI-NEXT: uaddl v2.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: uaddl2 v3.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: ushll v0.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll2 v1.4s, v2.8h, #0 +; CHECK-GI-NEXT: ushll v2.4s, v3.4h, #0 +; CHECK-GI-NEXT: ushll2 v3.4s, v3.8h, #0 ; CHECK-GI-NEXT: ret entry: %s0s = zext 
<16 x i8> %s0 to <16 x i32> @@ -283,14 +275,12 @@ define <16 x i32> @extsubs_v16i8_i32(<16 x i8> %s0, <16 x i8> %s1) { ; ; CHECK-GI-LABEL: extsubs_v16i8_i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0 -; CHECK-GI-NEXT: sshll v3.8h, v1.8b, #0 -; CHECK-GI-NEXT: sshll2 v4.8h, v0.16b, #0 -; CHECK-GI-NEXT: sshll2 v5.8h, v1.16b, #0 -; CHECK-GI-NEXT: ssubl v0.4s, v2.4h, v3.4h -; CHECK-GI-NEXT: ssubl2 v1.4s, v2.8h, v3.8h -; CHECK-GI-NEXT: ssubl v2.4s, v4.4h, v5.4h -; CHECK-GI-NEXT: ssubl2 v3.4s, v4.8h, v5.8h +; CHECK-GI-NEXT: ssubl v2.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: ssubl2 v3.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: sshll v0.4s, v2.4h, #0 +; CHECK-GI-NEXT: sshll2 v1.4s, v2.8h, #0 +; CHECK-GI-NEXT: sshll v2.4s, v3.4h, #0 +; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, #0 ; CHECK-GI-NEXT: ret entry: %s0s = sext <16 x i8> %s0 to <16 x i32> @@ -312,14 +302,12 @@ define <16 x i32> @extsubu_v16i8_i32(<16 x i8> %s0, <16 x i8> %s1) { ; ; CHECK-GI-LABEL: extsubu_v16i8_i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ushll v2.8h, v0.8b, #0 -; CHECK-GI-NEXT: ushll v3.8h, v1.8b, #0 -; CHECK-GI-NEXT: ushll2 v4.8h, v0.16b, #0 -; CHECK-GI-NEXT: ushll2 v5.8h, v1.16b, #0 -; CHECK-GI-NEXT: usubl v0.4s, v2.4h, v3.4h -; CHECK-GI-NEXT: usubl2 v1.4s, v2.8h, v3.8h -; CHECK-GI-NEXT: usubl v2.4s, v4.4h, v5.4h -; CHECK-GI-NEXT: usubl2 v3.4s, v4.8h, v5.8h +; CHECK-GI-NEXT: usubl v2.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: usubl2 v3.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: sshll v0.4s, v2.4h, #0 +; CHECK-GI-NEXT: sshll2 v1.4s, v2.8h, #0 +; CHECK-GI-NEXT: sshll v2.4s, v3.4h, #0 +; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, #0 ; CHECK-GI-NEXT: ret entry: %s0s = zext <16 x i8> %s0 to <16 x i32> @@ -342,16 +330,13 @@ define <8 x i64> @extadds_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) { ; ; CHECK-GI-LABEL: extadds_v8i8_i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-GI-NEXT: sshll v2.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll 
v3.4s, v1.4h, #0 -; CHECK-GI-NEXT: sshll2 v4.4s, v0.8h, #0 -; CHECK-GI-NEXT: sshll2 v5.4s, v1.8h, #0 -; CHECK-GI-NEXT: saddl v0.2d, v2.2s, v3.2s -; CHECK-GI-NEXT: saddl2 v1.2d, v2.4s, v3.4s -; CHECK-GI-NEXT: saddl v2.2d, v4.2s, v5.2s -; CHECK-GI-NEXT: saddl2 v3.2d, v4.4s, v5.4s +; CHECK-GI-NEXT: saddl v0.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: sshll v1.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll2 v3.4s, v0.8h, #0 +; CHECK-GI-NEXT: sshll v0.2d, v1.2s, #0 +; CHECK-GI-NEXT: sshll2 v1.2d, v1.4s, #0 +; CHECK-GI-NEXT: sshll v2.2d, v3.2s, #0 +; CHECK-GI-NEXT: sshll2 v3.2d, v3.4s, #0 ; CHECK-GI-NEXT: ret entry: %s0s = sext <8 x i8> %s0 to <8 x i64> @@ -374,16 +359,13 @@ define <8 x i64> @extaddu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) { ; ; CHECK-GI-LABEL: extaddu_v8i8_i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-GI-NEXT: ushll v2.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v3.4s, v1.4h, #0 -; CHECK-GI-NEXT: ushll2 v4.4s, v0.8h, #0 -; CHECK-GI-NEXT: ushll2 v5.4s, v1.8h, #0 -; CHECK-GI-NEXT: uaddl v0.2d, v2.2s, v3.2s -; CHECK-GI-NEXT: uaddl2 v1.2d, v2.4s, v3.4s -; CHECK-GI-NEXT: uaddl v2.2d, v4.2s, v5.2s -; CHECK-GI-NEXT: uaddl2 v3.2d, v4.4s, v5.4s +; CHECK-GI-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: ushll v1.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll2 v3.4s, v0.8h, #0 +; CHECK-GI-NEXT: ushll v0.2d, v1.2s, #0 +; CHECK-GI-NEXT: ushll2 v1.2d, v1.4s, #0 +; CHECK-GI-NEXT: ushll v2.2d, v3.2s, #0 +; CHECK-GI-NEXT: ushll2 v3.2d, v3.4s, #0 ; CHECK-GI-NEXT: ret entry: %s0s = zext <8 x i8> %s0 to <8 x i64> @@ -406,16 +388,13 @@ define <8 x i64> @extsubs_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) { ; ; CHECK-GI-LABEL: extsubs_v8i8_i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-GI-NEXT: sshll v2.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll v3.4s, v1.4h, #0 -; CHECK-GI-NEXT: sshll2 v4.4s, v0.8h, #0 -; CHECK-GI-NEXT: sshll2 v5.4s, v1.8h, #0 -; 
CHECK-GI-NEXT: ssubl v0.2d, v2.2s, v3.2s -; CHECK-GI-NEXT: ssubl2 v1.2d, v2.4s, v3.4s -; CHECK-GI-NEXT: ssubl v2.2d, v4.2s, v5.2s -; CHECK-GI-NEXT: ssubl2 v3.2d, v4.4s, v5.4s +; CHECK-GI-NEXT: ssubl v0.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: sshll v1.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll2 v3.4s, v0.8h, #0 +; CHECK-GI-NEXT: sshll v0.2d, v1.2s, #0 +; CHECK-GI-NEXT: sshll2 v1.2d, v1.4s, #0 +; CHECK-GI-NEXT: sshll v2.2d, v3.2s, #0 +; CHECK-GI-NEXT: sshll2 v3.2d, v3.4s, #0 ; CHECK-GI-NEXT: ret entry: %s0s = sext <8 x i8> %s0 to <8 x i64> @@ -438,16 +417,13 @@ define <8 x i64> @extsubu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) { ; ; CHECK-GI-LABEL: extsubu_v8i8_i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-GI-NEXT: ushll v2.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v3.4s, v1.4h, #0 -; CHECK-GI-NEXT: ushll2 v4.4s, v0.8h, #0 -; CHECK-GI-NEXT: ushll2 v5.4s, v1.8h, #0 -; CHECK-GI-NEXT: usubl v0.2d, v2.2s, v3.2s -; CHECK-GI-NEXT: usubl2 v1.2d, v2.4s, v3.4s -; CHECK-GI-NEXT: usubl v2.2d, v4.2s, v5.2s -; CHECK-GI-NEXT: usubl2 v3.2d, v4.4s, v5.4s +; CHECK-GI-NEXT: usubl v0.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: sshll v1.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll2 v3.4s, v0.8h, #0 +; CHECK-GI-NEXT: sshll v0.2d, v1.2s, #0 +; CHECK-GI-NEXT: sshll2 v1.2d, v1.4s, #0 +; CHECK-GI-NEXT: sshll v2.2d, v3.2s, #0 +; CHECK-GI-NEXT: sshll2 v3.2d, v3.4s, #0 ; CHECK-GI-NEXT: ret entry: %s0s = zext <8 x i8> %s0 to <8 x i64> @@ -477,26 +453,20 @@ define <16 x i64> @extaddu_v16i8_i64(<16 x i8> %a, <16 x i8> %b) { ; ; CHECK-GI-LABEL: extaddu_v16i8_i64: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ushll v2.8h, v0.8b, #0 -; CHECK-GI-NEXT: ushll v3.8h, v1.8b, #0 -; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0 -; CHECK-GI-NEXT: ushll2 v1.8h, v1.16b, #0 -; CHECK-GI-NEXT: ushll v4.4s, v2.4h, #0 -; CHECK-GI-NEXT: ushll2 v5.4s, v2.8h, #0 -; CHECK-GI-NEXT: ushll v2.4s, v3.4h, #0 -; CHECK-GI-NEXT: ushll v6.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll2 v3.4s, v3.8h, 
#0 -; CHECK-GI-NEXT: ushll v7.4s, v1.4h, #0 -; CHECK-GI-NEXT: ushll2 v16.4s, v0.8h, #0 -; CHECK-GI-NEXT: ushll2 v17.4s, v1.8h, #0 -; CHECK-GI-NEXT: uaddl v0.2d, v4.2s, v2.2s -; CHECK-GI-NEXT: uaddl2 v1.2d, v4.4s, v2.4s -; CHECK-GI-NEXT: uaddl v2.2d, v5.2s, v3.2s -; CHECK-GI-NEXT: uaddl2 v3.2d, v5.4s, v3.4s -; CHECK-GI-NEXT: uaddl v4.2d, v6.2s, v7.2s -; CHECK-GI-NEXT: uaddl2 v5.2d, v6.4s, v7.4s -; CHECK-GI-NEXT: uaddl v6.2d, v16.2s, v17.2s -; CHECK-GI-NEXT: uaddl2 v7.2d, v16.4s, v17.4s +; CHECK-GI-NEXT: uaddl v2.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: uaddl2 v0.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: ushll v1.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll2 v3.4s, v2.8h, #0 +; CHECK-GI-NEXT: ushll v5.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll2 v7.4s, v0.8h, #0 +; CHECK-GI-NEXT: ushll v0.2d, v1.2s, #0 +; CHECK-GI-NEXT: ushll2 v1.2d, v1.4s, #0 +; CHECK-GI-NEXT: ushll v2.2d, v3.2s, #0 +; CHECK-GI-NEXT: ushll2 v3.2d, v3.4s, #0 +; CHECK-GI-NEXT: ushll v4.2d, v5.2s, #0 +; CHECK-GI-NEXT: ushll2 v5.2d, v5.4s, #0 +; CHECK-GI-NEXT: ushll v6.2d, v7.2s, #0 +; CHECK-GI-NEXT: ushll2 v7.2d, v7.4s, #0 ; CHECK-GI-NEXT: ret %c = zext <16 x i8> %a to <16 x i64> %d = zext <16 x i8> %b to <16 x i64> @@ -525,26 +495,20 @@ define <16 x i64> @extadds_v16i8_i64(<16 x i8> %a, <16 x i8> %b) { ; ; CHECK-GI-LABEL: extadds_v16i8_i64: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0 -; CHECK-GI-NEXT: sshll v3.8h, v1.8b, #0 -; CHECK-GI-NEXT: sshll2 v0.8h, v0.16b, #0 -; CHECK-GI-NEXT: sshll2 v1.8h, v1.16b, #0 -; CHECK-GI-NEXT: sshll v4.4s, v2.4h, #0 -; CHECK-GI-NEXT: sshll2 v5.4s, v2.8h, #0 -; CHECK-GI-NEXT: sshll v2.4s, v3.4h, #0 -; CHECK-GI-NEXT: sshll v6.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, #0 -; CHECK-GI-NEXT: sshll v7.4s, v1.4h, #0 -; CHECK-GI-NEXT: sshll2 v16.4s, v0.8h, #0 -; CHECK-GI-NEXT: sshll2 v17.4s, v1.8h, #0 -; CHECK-GI-NEXT: saddl v0.2d, v4.2s, v2.2s -; CHECK-GI-NEXT: saddl2 v1.2d, v4.4s, v2.4s -; CHECK-GI-NEXT: saddl v2.2d, v5.2s, v3.2s -; CHECK-GI-NEXT: saddl2 v3.2d, v5.4s, 
v3.4s -; CHECK-GI-NEXT: saddl v4.2d, v6.2s, v7.2s -; CHECK-GI-NEXT: saddl2 v5.2d, v6.4s, v7.4s -; CHECK-GI-NEXT: saddl v6.2d, v16.2s, v17.2s -; CHECK-GI-NEXT: saddl2 v7.2d, v16.4s, v17.4s +; CHECK-GI-NEXT: saddl v2.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: saddl2 v0.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: sshll v1.4s, v2.4h, #0 +; CHECK-GI-NEXT: sshll2 v3.4s, v2.8h, #0 +; CHECK-GI-NEXT: sshll v5.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll2 v7.4s, v0.8h, #0 +; CHECK-GI-NEXT: sshll v0.2d, v1.2s, #0 +; CHECK-GI-NEXT: sshll2 v1.2d, v1.4s, #0 +; CHECK-GI-NEXT: sshll v2.2d, v3.2s, #0 +; CHECK-GI-NEXT: sshll2 v3.2d, v3.4s, #0 +; CHECK-GI-NEXT: sshll v4.2d, v5.2s, #0 +; CHECK-GI-NEXT: sshll2 v5.2d, v5.4s, #0 +; CHECK-GI-NEXT: sshll v6.2d, v7.2s, #0 +; CHECK-GI-NEXT: sshll2 v7.2d, v7.4s, #0 ; CHECK-GI-NEXT: ret %c = sext <16 x i8> %a to <16 x i64> %d = sext <16 x i8> %b to <16 x i64> @@ -573,26 +537,20 @@ define <16 x i64> @extsubu_v16i8_i64(<16 x i8> %a, <16 x i8> %b) { ; ; CHECK-GI-LABEL: extsubu_v16i8_i64: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ushll v2.8h, v0.8b, #0 -; CHECK-GI-NEXT: ushll v3.8h, v1.8b, #0 -; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0 -; CHECK-GI-NEXT: ushll2 v1.8h, v1.16b, #0 -; CHECK-GI-NEXT: ushll v4.4s, v2.4h, #0 -; CHECK-GI-NEXT: ushll2 v5.4s, v2.8h, #0 -; CHECK-GI-NEXT: ushll v2.4s, v3.4h, #0 -; CHECK-GI-NEXT: ushll v6.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll2 v3.4s, v3.8h, #0 -; CHECK-GI-NEXT: ushll v7.4s, v1.4h, #0 -; CHECK-GI-NEXT: ushll2 v16.4s, v0.8h, #0 -; CHECK-GI-NEXT: ushll2 v17.4s, v1.8h, #0 -; CHECK-GI-NEXT: usubl v0.2d, v4.2s, v2.2s -; CHECK-GI-NEXT: usubl2 v1.2d, v4.4s, v2.4s -; CHECK-GI-NEXT: usubl v2.2d, v5.2s, v3.2s -; CHECK-GI-NEXT: usubl2 v3.2d, v5.4s, v3.4s -; CHECK-GI-NEXT: usubl v4.2d, v6.2s, v7.2s -; CHECK-GI-NEXT: usubl2 v5.2d, v6.4s, v7.4s -; CHECK-GI-NEXT: usubl v6.2d, v16.2s, v17.2s -; CHECK-GI-NEXT: usubl2 v7.2d, v16.4s, v17.4s +; CHECK-GI-NEXT: usubl v2.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: usubl2 v0.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: sshll 
v1.4s, v2.4h, #0 +; CHECK-GI-NEXT: sshll2 v3.4s, v2.8h, #0 +; CHECK-GI-NEXT: sshll v5.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll2 v7.4s, v0.8h, #0 +; CHECK-GI-NEXT: sshll v0.2d, v1.2s, #0 +; CHECK-GI-NEXT: sshll2 v1.2d, v1.4s, #0 +; CHECK-GI-NEXT: sshll v2.2d, v3.2s, #0 +; CHECK-GI-NEXT: sshll2 v3.2d, v3.4s, #0 +; CHECK-GI-NEXT: sshll v4.2d, v5.2s, #0 +; CHECK-GI-NEXT: sshll2 v5.2d, v5.4s, #0 +; CHECK-GI-NEXT: sshll v6.2d, v7.2s, #0 +; CHECK-GI-NEXT: sshll2 v7.2d, v7.4s, #0 ; CHECK-GI-NEXT: ret %c = zext <16 x i8> %a to <16 x i64> %d = zext <16 x i8> %b to <16 x i64> @@ -621,26 +579,20 @@ define <16 x i64> @extsubs_v16i8_i64(<16 x i8> %a, <16 x i8> %b) { ; ; CHECK-GI-LABEL: extsubs_v16i8_i64: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0 -; CHECK-GI-NEXT: sshll v3.8h, v1.8b, #0 -; CHECK-GI-NEXT: sshll2 v0.8h, v0.16b, #0 -; CHECK-GI-NEXT: sshll2 v1.8h, v1.16b, #0 -; CHECK-GI-NEXT: sshll v4.4s, v2.4h, #0 -; CHECK-GI-NEXT: sshll2 v5.4s, v2.8h, #0 -; CHECK-GI-NEXT: sshll v2.4s, v3.4h, #0 -; CHECK-GI-NEXT: sshll v6.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, #0 -; CHECK-GI-NEXT: sshll v7.4s, v1.4h, #0 -; CHECK-GI-NEXT: sshll2 v16.4s, v0.8h, #0 -; CHECK-GI-NEXT: sshll2 v17.4s, v1.8h, #0 -; CHECK-GI-NEXT: ssubl v0.2d, v4.2s, v2.2s -; CHECK-GI-NEXT: ssubl2 v1.2d, v4.4s, v2.4s -; CHECK-GI-NEXT: ssubl v2.2d, v5.2s, v3.2s -; CHECK-GI-NEXT: ssubl2 v3.2d, v5.4s, v3.4s -; CHECK-GI-NEXT: ssubl v4.2d, v6.2s, v7.2s -; CHECK-GI-NEXT: ssubl2 v5.2d, v6.4s, v7.4s -; CHECK-GI-NEXT: ssubl v6.2d, v16.2s, v17.2s -; CHECK-GI-NEXT: ssubl2 v7.2d, v16.4s, v17.4s +; CHECK-GI-NEXT: ssubl v2.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: ssubl2 v0.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: sshll v1.4s, v2.4h, #0 +; CHECK-GI-NEXT: sshll2 v3.4s, v2.8h, #0 +; CHECK-GI-NEXT: sshll v5.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll2 v7.4s, v0.8h, #0 +; CHECK-GI-NEXT: sshll v0.2d, v1.2s, #0 +; CHECK-GI-NEXT: sshll2 v1.2d, v1.4s, #0 +; CHECK-GI-NEXT: sshll v2.2d, v3.2s, #0 +; CHECK-GI-NEXT: sshll2 v3.2d, 
v3.4s, #0 +; CHECK-GI-NEXT: sshll v4.2d, v5.2s, #0 +; CHECK-GI-NEXT: sshll2 v5.2d, v5.4s, #0 +; CHECK-GI-NEXT: sshll v6.2d, v7.2s, #0 +; CHECK-GI-NEXT: sshll2 v7.2d, v7.4s, #0 ; CHECK-GI-NEXT: ret %c = sext <16 x i8> %a to <16 x i64> %d = sext <16 x i8> %b to <16 x i64> @@ -667,22 +619,18 @@ define <16 x i64> @extaddu_v16i16_i64(<16 x i16> %a, <16 x i16> %b) { ; ; CHECK-GI-LABEL: extaddu_v16i16_i64: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ushll v4.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll2 v5.4s, v0.8h, #0 -; CHECK-GI-NEXT: ushll v6.4s, v2.4h, #0 -; CHECK-GI-NEXT: ushll v7.4s, v1.4h, #0 -; CHECK-GI-NEXT: ushll2 v16.4s, v2.8h, #0 -; CHECK-GI-NEXT: ushll v17.4s, v3.4h, #0 -; CHECK-GI-NEXT: ushll2 v18.4s, v1.8h, #0 -; CHECK-GI-NEXT: ushll2 v19.4s, v3.8h, #0 -; CHECK-GI-NEXT: uaddl v0.2d, v4.2s, v6.2s -; CHECK-GI-NEXT: uaddl2 v1.2d, v4.4s, v6.4s -; CHECK-GI-NEXT: uaddl v2.2d, v5.2s, v16.2s -; CHECK-GI-NEXT: uaddl2 v3.2d, v5.4s, v16.4s -; CHECK-GI-NEXT: uaddl v4.2d, v7.2s, v17.2s -; CHECK-GI-NEXT: uaddl2 v5.2d, v7.4s, v17.4s -; CHECK-GI-NEXT: uaddl v6.2d, v18.2s, v19.2s -; CHECK-GI-NEXT: uaddl2 v7.2d, v18.4s, v19.4s +; CHECK-GI-NEXT: uaddl v4.4s, v0.4h, v2.4h +; CHECK-GI-NEXT: uaddl2 v5.4s, v0.8h, v2.8h +; CHECK-GI-NEXT: uaddl v6.4s, v1.4h, v3.4h +; CHECK-GI-NEXT: uaddl2 v7.4s, v1.8h, v3.8h +; CHECK-GI-NEXT: ushll v0.2d, v4.2s, #0 +; CHECK-GI-NEXT: ushll2 v1.2d, v4.4s, #0 +; CHECK-GI-NEXT: ushll v2.2d, v5.2s, #0 +; CHECK-GI-NEXT: ushll2 v3.2d, v5.4s, #0 +; CHECK-GI-NEXT: ushll v4.2d, v6.2s, #0 +; CHECK-GI-NEXT: ushll2 v5.2d, v6.4s, #0 +; CHECK-GI-NEXT: ushll v6.2d, v7.2s, #0 +; CHECK-GI-NEXT: ushll2 v7.2d, v7.4s, #0 ; CHECK-GI-NEXT: ret %c = zext <16 x i16> %a to <16 x i64> %d = zext <16 x i16> %b to <16 x i64> @@ -709,22 +657,18 @@ define <16 x i64> @extadds_v16i16_i64(<16 x i16> %a, <16 x i16> %b) { ; ; CHECK-GI-LABEL: extadds_v16i16_i64: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: sshll v4.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll2 v5.4s, v0.8h, #0 -; CHECK-GI-NEXT: sshll v6.4s, 
v2.4h, #0 -; CHECK-GI-NEXT: sshll v7.4s, v1.4h, #0 -; CHECK-GI-NEXT: sshll2 v16.4s, v2.8h, #0 -; CHECK-GI-NEXT: sshll v17.4s, v3.4h, #0 -; CHECK-GI-NEXT: sshll2 v18.4s, v1.8h, #0 -; CHECK-GI-NEXT: sshll2 v19.4s, v3.8h, #0 -; CHECK-GI-NEXT: saddl v0.2d, v4.2s, v6.2s -; CHECK-GI-NEXT: saddl2 v1.2d, v4.4s, v6.4s -; CHECK-GI-NEXT: saddl v2.2d, v5.2s, v16.2s -; CHECK-GI-NEXT: saddl2 v3.2d, v5.4s, v16.4s -; CHECK-GI-NEXT: saddl v4.2d, v7.2s, v17.2s -; CHECK-GI-NEXT: saddl2 v5.2d, v7.4s, v17.4s -; CHECK-GI-NEXT: saddl v6.2d, v18.2s, v19.2s -; CHECK-GI-NEXT: saddl2 v7.2d, v18.4s, v19.4s +; CHECK-GI-NEXT: saddl v4.4s, v0.4h, v2.4h +; CHECK-GI-NEXT: saddl2 v5.4s, v0.8h, v2.8h +; CHECK-GI-NEXT: saddl v6.4s, v1.4h, v3.4h +; CHECK-GI-NEXT: saddl2 v7.4s, v1.8h, v3.8h +; CHECK-GI-NEXT: sshll v0.2d, v4.2s, #0 +; CHECK-GI-NEXT: sshll2 v1.2d, v4.4s, #0 +; CHECK-GI-NEXT: sshll v2.2d, v5.2s, #0 +; CHECK-GI-NEXT: sshll2 v3.2d, v5.4s, #0 +; CHECK-GI-NEXT: sshll v4.2d, v6.2s, #0 +; CHECK-GI-NEXT: sshll2 v5.2d, v6.4s, #0 +; CHECK-GI-NEXT: sshll v6.2d, v7.2s, #0 +; CHECK-GI-NEXT: sshll2 v7.2d, v7.4s, #0 ; CHECK-GI-NEXT: ret %c = sext <16 x i16> %a to <16 x i64> %d = sext <16 x i16> %b to <16 x i64> @@ -751,22 +695,18 @@ define <16 x i64> @extsubu_v16i16_i64(<16 x i16> %a, <16 x i16> %b) { ; ; CHECK-GI-LABEL: extsubu_v16i16_i64: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ushll v4.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll2 v5.4s, v0.8h, #0 -; CHECK-GI-NEXT: ushll v6.4s, v2.4h, #0 -; CHECK-GI-NEXT: ushll v7.4s, v1.4h, #0 -; CHECK-GI-NEXT: ushll2 v16.4s, v2.8h, #0 -; CHECK-GI-NEXT: ushll v17.4s, v3.4h, #0 -; CHECK-GI-NEXT: ushll2 v18.4s, v1.8h, #0 -; CHECK-GI-NEXT: ushll2 v19.4s, v3.8h, #0 -; CHECK-GI-NEXT: usubl v0.2d, v4.2s, v6.2s -; CHECK-GI-NEXT: usubl2 v1.2d, v4.4s, v6.4s -; CHECK-GI-NEXT: usubl v2.2d, v5.2s, v16.2s -; CHECK-GI-NEXT: usubl2 v3.2d, v5.4s, v16.4s -; CHECK-GI-NEXT: usubl v4.2d, v7.2s, v17.2s -; CHECK-GI-NEXT: usubl2 v5.2d, v7.4s, v17.4s -; CHECK-GI-NEXT: usubl v6.2d, v18.2s, v19.2s 
-; CHECK-GI-NEXT: usubl2 v7.2d, v18.4s, v19.4s +; CHECK-GI-NEXT: usubl v4.4s, v0.4h, v2.4h +; CHECK-GI-NEXT: usubl2 v5.4s, v0.8h, v2.8h +; CHECK-GI-NEXT: usubl v6.4s, v1.4h, v3.4h +; CHECK-GI-NEXT: usubl2 v7.4s, v1.8h, v3.8h +; CHECK-GI-NEXT: sshll v0.2d, v4.2s, #0 +; CHECK-GI-NEXT: sshll2 v1.2d, v4.4s, #0 +; CHECK-GI-NEXT: sshll v2.2d, v5.2s, #0 +; CHECK-GI-NEXT: sshll2 v3.2d, v5.4s, #0 +; CHECK-GI-NEXT: sshll v4.2d, v6.2s, #0 +; CHECK-GI-NEXT: sshll2 v5.2d, v6.4s, #0 +; CHECK-GI-NEXT: sshll v6.2d, v7.2s, #0 +; CHECK-GI-NEXT: sshll2 v7.2d, v7.4s, #0 ; CHECK-GI-NEXT: ret %c = zext <16 x i16> %a to <16 x i64> %d = zext <16 x i16> %b to <16 x i64> @@ -793,22 +733,18 @@ define <16 x i64> @extsubs_v16i16_i64(<16 x i16> %a, <16 x i16> %b) { ; ; CHECK-GI-LABEL: extsubs_v16i16_i64: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: sshll v4.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll2 v5.4s, v0.8h, #0 -; CHECK-GI-NEXT: sshll v6.4s, v2.4h, #0 -; CHECK-GI-NEXT: sshll v7.4s, v1.4h, #0 -; CHECK-GI-NEXT: sshll2 v16.4s, v2.8h, #0 -; CHECK-GI-NEXT: sshll v17.4s, v3.4h, #0 -; CHECK-GI-NEXT: sshll2 v18.4s, v1.8h, #0 -; CHECK-GI-NEXT: sshll2 v19.4s, v3.8h, #0 -; CHECK-GI-NEXT: ssubl v0.2d, v4.2s, v6.2s -; CHECK-GI-NEXT: ssubl2 v1.2d, v4.4s, v6.4s -; CHECK-GI-NEXT: ssubl v2.2d, v5.2s, v16.2s -; CHECK-GI-NEXT: ssubl2 v3.2d, v5.4s, v16.4s -; CHECK-GI-NEXT: ssubl v4.2d, v7.2s, v17.2s -; CHECK-GI-NEXT: ssubl2 v5.2d, v7.4s, v17.4s -; CHECK-GI-NEXT: ssubl v6.2d, v18.2s, v19.2s -; CHECK-GI-NEXT: ssubl2 v7.2d, v18.4s, v19.4s +; CHECK-GI-NEXT: ssubl v4.4s, v0.4h, v2.4h +; CHECK-GI-NEXT: ssubl2 v5.4s, v0.8h, v2.8h +; CHECK-GI-NEXT: ssubl v6.4s, v1.4h, v3.4h +; CHECK-GI-NEXT: ssubl2 v7.4s, v1.8h, v3.8h +; CHECK-GI-NEXT: sshll v0.2d, v4.2s, #0 +; CHECK-GI-NEXT: sshll2 v1.2d, v4.4s, #0 +; CHECK-GI-NEXT: sshll v2.2d, v5.2s, #0 +; CHECK-GI-NEXT: sshll2 v3.2d, v5.4s, #0 +; CHECK-GI-NEXT: sshll v4.2d, v6.2s, #0 +; CHECK-GI-NEXT: sshll2 v5.2d, v6.4s, #0 +; CHECK-GI-NEXT: sshll v6.2d, v7.2s, #0 +; CHECK-GI-NEXT: 
sshll2 v7.2d, v7.4s, #0 ; CHECK-GI-NEXT: ret %c = sext <16 x i16> %a to <16 x i64> %d = sext <16 x i16> %b to <16 x i64> @@ -948,10 +884,9 @@ define <4 x i64> @extadds_v4i16_i64(<4 x i16> %s0, <4 x i16> %s1) { ; ; CHECK-GI-LABEL: extadds_v4i16_i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v2.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 -; CHECK-GI-NEXT: saddl v0.2d, v2.2s, v1.2s -; CHECK-GI-NEXT: saddl2 v1.2d, v2.4s, v1.4s +; CHECK-GI-NEXT: saddl v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: sshll v0.2d, v1.2s, #0 +; CHECK-GI-NEXT: sshll2 v1.2d, v1.4s, #0 ; CHECK-GI-NEXT: ret entry: %s0s = sext <4 x i16> %s0 to <4 x i64> @@ -970,10 +905,9 @@ define <4 x i64> @extaddu_v4i16_i64(<4 x i16> %s0, <4 x i16> %s1) { ; ; CHECK-GI-LABEL: extaddu_v4i16_i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ushll v2.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-GI-NEXT: uaddl v0.2d, v2.2s, v1.2s -; CHECK-GI-NEXT: uaddl2 v1.2d, v2.4s, v1.4s +; CHECK-GI-NEXT: uaddl v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: ushll v0.2d, v1.2s, #0 +; CHECK-GI-NEXT: ushll2 v1.2d, v1.4s, #0 ; CHECK-GI-NEXT: ret entry: %s0s = zext <4 x i16> %s0 to <4 x i64> @@ -995,14 +929,12 @@ define <8 x i64> @extadds_v8i16_i64(<8 x i16> %s0, <8 x i16> %s1) { ; ; CHECK-GI-LABEL: extadds_v8i16_i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v2.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll v3.4s, v1.4h, #0 -; CHECK-GI-NEXT: sshll2 v4.4s, v0.8h, #0 -; CHECK-GI-NEXT: sshll2 v5.4s, v1.8h, #0 -; CHECK-GI-NEXT: saddl v0.2d, v2.2s, v3.2s -; CHECK-GI-NEXT: saddl2 v1.2d, v2.4s, v3.4s -; CHECK-GI-NEXT: saddl v2.2d, v4.2s, v5.2s -; CHECK-GI-NEXT: saddl2 v3.2d, v4.4s, v5.4s +; CHECK-GI-NEXT: saddl v2.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: saddl2 v3.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: sshll v0.2d, v2.2s, #0 +; CHECK-GI-NEXT: sshll2 v1.2d, v2.4s, #0 +; CHECK-GI-NEXT: sshll v2.2d, v3.2s, #0 +; CHECK-GI-NEXT: sshll2 v3.2d, v3.4s, #0 ; CHECK-GI-NEXT: ret entry: %s0s = sext <8 x i16> %s0 to <8 x i64> 
@@ -1024,14 +956,12 @@ define <8 x i64> @extaddu_v8i16_i64(<8 x i16> %s0, <8 x i16> %s1) { ; ; CHECK-GI-LABEL: extaddu_v8i16_i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ushll v2.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v3.4s, v1.4h, #0 -; CHECK-GI-NEXT: ushll2 v4.4s, v0.8h, #0 -; CHECK-GI-NEXT: ushll2 v5.4s, v1.8h, #0 -; CHECK-GI-NEXT: uaddl v0.2d, v2.2s, v3.2s -; CHECK-GI-NEXT: uaddl2 v1.2d, v2.4s, v3.4s -; CHECK-GI-NEXT: uaddl v2.2d, v4.2s, v5.2s -; CHECK-GI-NEXT: uaddl2 v3.2d, v4.4s, v5.4s +; CHECK-GI-NEXT: uaddl v2.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: uaddl2 v3.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: ushll v0.2d, v2.2s, #0 +; CHECK-GI-NEXT: ushll2 v1.2d, v2.4s, #0 +; CHECK-GI-NEXT: ushll v2.2d, v3.2s, #0 +; CHECK-GI-NEXT: ushll2 v3.2d, v3.4s, #0 ; CHECK-GI-NEXT: ret entry: %s0s = zext <8 x i16> %s0 to <8 x i64> @@ -1053,14 +983,12 @@ define <8 x i64> @extsubs_v8i16_i64(<8 x i16> %s0, <8 x i16> %s1) { ; ; CHECK-GI-LABEL: extsubs_v8i16_i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v2.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll v3.4s, v1.4h, #0 -; CHECK-GI-NEXT: sshll2 v4.4s, v0.8h, #0 -; CHECK-GI-NEXT: sshll2 v5.4s, v1.8h, #0 -; CHECK-GI-NEXT: ssubl v0.2d, v2.2s, v3.2s -; CHECK-GI-NEXT: ssubl2 v1.2d, v2.4s, v3.4s -; CHECK-GI-NEXT: ssubl v2.2d, v4.2s, v5.2s -; CHECK-GI-NEXT: ssubl2 v3.2d, v4.4s, v5.4s +; CHECK-GI-NEXT: ssubl v2.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: ssubl2 v3.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: sshll v0.2d, v2.2s, #0 +; CHECK-GI-NEXT: sshll2 v1.2d, v2.4s, #0 +; CHECK-GI-NEXT: sshll v2.2d, v3.2s, #0 +; CHECK-GI-NEXT: sshll2 v3.2d, v3.4s, #0 ; CHECK-GI-NEXT: ret entry: %s0s = sext <8 x i16> %s0 to <8 x i64> @@ -1082,14 +1010,12 @@ define <8 x i64> @extsubu_v8i16_i64(<8 x i16> %s0, <8 x i16> %s1) { ; ; CHECK-GI-LABEL: extsubu_v8i16_i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ushll v2.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v3.4s, v1.4h, #0 -; CHECK-GI-NEXT: ushll2 v4.4s, v0.8h, #0 -; CHECK-GI-NEXT: ushll2 v5.4s, v1.8h, #0 -; 
CHECK-GI-NEXT: usubl v0.2d, v2.2s, v3.2s -; CHECK-GI-NEXT: usubl2 v1.2d, v2.4s, v3.4s -; CHECK-GI-NEXT: usubl v2.2d, v4.2s, v5.2s -; CHECK-GI-NEXT: usubl2 v3.2d, v4.4s, v5.4s +; CHECK-GI-NEXT: usubl v2.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: usubl2 v3.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: sshll v0.2d, v2.2s, #0 +; CHECK-GI-NEXT: sshll2 v1.2d, v2.4s, #0 +; CHECK-GI-NEXT: sshll v2.2d, v3.2s, #0 +; CHECK-GI-NEXT: sshll2 v3.2d, v3.4s, #0 ; CHECK-GI-NEXT: ret entry: %s0s = zext <8 x i16> %s0 to <8 x i64> @@ -1343,86 +1269,92 @@ define <20 x i32> @v20(<20 x i8> %s0, <20 x i8> %s1) { ; CHECK-GI-NEXT: ldr s0, [sp] ; CHECK-GI-NEXT: ldr s4, [sp, #8] ; CHECK-GI-NEXT: fmov s1, w0 -; CHECK-GI-NEXT: ldr s18, [sp, #16] ; CHECK-GI-NEXT: ldr s2, [sp, #32] +; CHECK-GI-NEXT: ldr s19, [sp, #40] ; CHECK-GI-NEXT: fmov s3, w4 ; CHECK-GI-NEXT: mov v0.s[1], v4.s[0] -; CHECK-GI-NEXT: ldr s16, [sp, #40] -; CHECK-GI-NEXT: ldr s4, [sp, #64] -; CHECK-GI-NEXT: ldr s19, [sp, #72] -; CHECK-GI-NEXT: ldr s21, [sp, #104] -; CHECK-GI-NEXT: mov v1.s[1], w1 -; CHECK-GI-NEXT: mov v2.s[1], v16.s[0] ; CHECK-GI-NEXT: ldr s16, [sp, #96] -; CHECK-GI-NEXT: ldr s22, [sp, #136] +; CHECK-GI-NEXT: ldr s22, [sp, #104] +; CHECK-GI-NEXT: mov v2.s[1], v19.s[0] +; CHECK-GI-NEXT: ldr s19, [sp, #128] +; CHECK-GI-NEXT: ldr s23, [sp, #136] +; CHECK-GI-NEXT: ldr s18, [sp, #16] +; CHECK-GI-NEXT: mov v1.s[1], w1 ; CHECK-GI-NEXT: mov v3.s[1], w5 -; CHECK-GI-NEXT: ldr s20, [sp, #48] -; CHECK-GI-NEXT: mov v4.s[1], v19.s[0] +; CHECK-GI-NEXT: mov v16.s[1], v22.s[0] +; CHECK-GI-NEXT: mov v19.s[1], v23.s[0] +; CHECK-GI-NEXT: ldr s4, [sp, #64] +; CHECK-GI-NEXT: ldr s21, [sp, #72] ; CHECK-GI-NEXT: mov v0.s[2], v18.s[0] -; CHECK-GI-NEXT: ldr s18, [sp, #128] -; CHECK-GI-NEXT: ldr s19, [sp, #160] +; CHECK-GI-NEXT: ldr s18, [sp, #160] ; CHECK-GI-NEXT: ldr s24, [sp, #168] -; CHECK-GI-NEXT: mov v16.s[1], v21.s[0] -; CHECK-GI-NEXT: ldr s21, [sp, #192] -; CHECK-GI-NEXT: mov v18.s[1], v22.s[0] +; CHECK-GI-NEXT: ldr s20, [sp, #192] ; CHECK-GI-NEXT: ldr 
s25, [sp, #200] ; CHECK-GI-NEXT: ldr s22, [sp, #224] -; CHECK-GI-NEXT: ldr s26, [sp, #232] +; CHECK-GI-NEXT: ldr s27, [sp, #232] ; CHECK-GI-NEXT: ldr s23, [sp, #112] -; CHECK-GI-NEXT: mov v19.s[1], v24.s[0] -; CHECK-GI-NEXT: mov v2.s[2], v20.s[0] -; CHECK-GI-NEXT: ldr s20, [sp, #144] -; CHECK-GI-NEXT: ldr s17, [sp, #80] -; CHECK-GI-NEXT: mov v21.s[1], v25.s[0] -; CHECK-GI-NEXT: mov v22.s[1], v26.s[0] +; CHECK-GI-NEXT: ldr s26, [sp, #144] +; CHECK-GI-NEXT: mov v18.s[1], v24.s[0] +; CHECK-GI-NEXT: mov v20.s[1], v25.s[0] +; CHECK-GI-NEXT: mov v4.s[1], v21.s[0] +; CHECK-GI-NEXT: mov v22.s[1], v27.s[0] ; CHECK-GI-NEXT: mov v1.s[2], w2 +; CHECK-GI-NEXT: ldr s17, [sp, #48] ; CHECK-GI-NEXT: mov v3.s[2], w6 -; CHECK-GI-NEXT: ldr s24, [sp, #176] ; CHECK-GI-NEXT: mov v16.s[2], v23.s[0] -; CHECK-GI-NEXT: mov v18.s[2], v20.s[0] -; CHECK-GI-NEXT: mov v4.s[2], v17.s[0] -; CHECK-GI-NEXT: ldr s17, [sp, #208] -; CHECK-GI-NEXT: ldr s23, [sp, #240] -; CHECK-GI-NEXT: ldr s20, [sp, #120] -; CHECK-GI-NEXT: mov v19.s[2], v24.s[0] -; CHECK-GI-NEXT: ldr s24, [sp, #152] +; CHECK-GI-NEXT: mov v19.s[2], v26.s[0] +; CHECK-GI-NEXT: ldr s7, [sp, #80] +; CHECK-GI-NEXT: ldr s21, [sp, #176] +; CHECK-GI-NEXT: ldr s24, [sp, #208] +; CHECK-GI-NEXT: ldr s25, [sp, #240] +; CHECK-GI-NEXT: mov v2.s[2], v17.s[0] +; CHECK-GI-NEXT: ldr s17, [sp, #120] +; CHECK-GI-NEXT: ldr s23, [sp, #152] ; CHECK-GI-NEXT: ldr s5, [sp, #24] -; CHECK-GI-NEXT: mov v21.s[2], v17.s[0] -; CHECK-GI-NEXT: mov v22.s[2], v23.s[0] +; CHECK-GI-NEXT: mov v18.s[2], v21.s[0] +; CHECK-GI-NEXT: mov v20.s[2], v24.s[0] +; CHECK-GI-NEXT: mov v4.s[2], v7.s[0] +; CHECK-GI-NEXT: mov v22.s[2], v25.s[0] ; CHECK-GI-NEXT: mov v1.s[3], w3 -; CHECK-GI-NEXT: mov v16.s[3], v20.s[0] -; CHECK-GI-NEXT: movi v17.2d, #0x0000ff000000ff ; CHECK-GI-NEXT: mov v3.s[3], w7 -; CHECK-GI-NEXT: mov v18.s[3], v24.s[0] +; CHECK-GI-NEXT: mov v16.s[3], v17.s[0] +; CHECK-GI-NEXT: mov v19.s[3], v23.s[0] ; CHECK-GI-NEXT: ldr s6, [sp, #56] -; CHECK-GI-NEXT: ldr s7, [sp, #88] -; 
CHECK-GI-NEXT: ldr s25, [sp, #184] -; CHECK-GI-NEXT: ldr s20, [sp, #216] +; CHECK-GI-NEXT: ldr s7, [sp, #184] +; CHECK-GI-NEXT: ldr s21, [sp, #216] +; CHECK-GI-NEXT: ldr s17, [sp, #88] ; CHECK-GI-NEXT: mov v0.s[3], v5.s[0] ; CHECK-GI-NEXT: ldr s5, [sp, #248] -; CHECK-GI-NEXT: mov v19.s[3], v25.s[0] ; CHECK-GI-NEXT: mov v2.s[3], v6.s[0] -; CHECK-GI-NEXT: mov v4.s[3], v7.s[0] -; CHECK-GI-NEXT: mov v21.s[3], v20.s[0] +; CHECK-GI-NEXT: mov v18.s[3], v7.s[0] +; CHECK-GI-NEXT: mov v20.s[3], v21.s[0] +; CHECK-GI-NEXT: mov v4.s[3], v17.s[0] ; CHECK-GI-NEXT: mov v22.s[3], v5.s[0] -; CHECK-GI-NEXT: and v1.16b, v1.16b, v17.16b -; CHECK-GI-NEXT: and v5.16b, v16.16b, v17.16b -; CHECK-GI-NEXT: and v3.16b, v3.16b, v17.16b -; CHECK-GI-NEXT: and v6.16b, v18.16b, v17.16b -; CHECK-GI-NEXT: and v0.16b, v0.16b, v17.16b -; CHECK-GI-NEXT: and v7.16b, v19.16b, v17.16b -; CHECK-GI-NEXT: and v2.16b, v2.16b, v17.16b -; CHECK-GI-NEXT: and v4.16b, v4.16b, v17.16b -; CHECK-GI-NEXT: and v16.16b, v21.16b, v17.16b -; CHECK-GI-NEXT: add v1.4s, v1.4s, v5.4s -; CHECK-GI-NEXT: and v5.16b, v22.16b, v17.16b -; CHECK-GI-NEXT: add v3.4s, v3.4s, v6.4s -; CHECK-GI-NEXT: add v0.4s, v0.4s, v7.4s -; CHECK-GI-NEXT: add v2.4s, v2.4s, v16.4s -; CHECK-GI-NEXT: stp q1, q3, [x8] -; CHECK-GI-NEXT: add v1.4s, v4.4s, v5.4s -; CHECK-GI-NEXT: stp q0, q2, [x8, #32] -; CHECK-GI-NEXT: str q1, [x8, #64] +; CHECK-GI-NEXT: uzp1 v1.8h, v1.8h, v3.8h +; CHECK-GI-NEXT: movi v3.2d, #0xff00ff00ff00ff +; CHECK-GI-NEXT: uzp1 v5.8h, v16.8h, v19.8h +; CHECK-GI-NEXT: dup v6.4s, w8 +; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-GI-NEXT: uzp1 v2.8h, v18.8h, v20.8h +; CHECK-GI-NEXT: uzp1 v4.8h, v4.8h, v6.8h +; CHECK-GI-NEXT: uzp1 v6.8h, v22.8h, v6.8h +; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: and v5.16b, v5.16b, v3.16b +; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-GI-NEXT: and v2.16b, v2.16b, v3.16b +; CHECK-GI-NEXT: add v1.8h, v1.8h, v5.8h +; CHECK-GI-NEXT: and v4.16b, v4.16b, v3.16b +; CHECK-GI-NEXT: and 
v3.16b, v6.16b, v3.16b +; CHECK-GI-NEXT: add v0.8h, v0.8h, v2.8h +; CHECK-GI-NEXT: ushll v2.4s, v1.4h, #0 +; CHECK-GI-NEXT: add v3.4h, v4.4h, v3.4h +; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0 +; CHECK-GI-NEXT: ushll v4.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0 +; CHECK-GI-NEXT: stp q2, q1, [x8] +; CHECK-GI-NEXT: ushll v2.4s, v3.4h, #0 +; CHECK-GI-NEXT: stp q4, q0, [x8, #32] +; CHECK-GI-NEXT: str q2, [x8, #64] ; CHECK-GI-NEXT: ret entry: %s0s = zext <20 x i8> %s0 to <20 x i32> @@ -1611,14 +1543,12 @@ define <16 x i32> @sub_zz(<16 x i8> %s0, <16 x i8> %s1) { ; ; CHECK-GI-LABEL: sub_zz: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ushll v2.8h, v0.8b, #0 -; CHECK-GI-NEXT: ushll v3.8h, v1.8b, #0 -; CHECK-GI-NEXT: ushll2 v4.8h, v0.16b, #0 -; CHECK-GI-NEXT: ushll2 v5.8h, v1.16b, #0 -; CHECK-GI-NEXT: usubl v0.4s, v2.4h, v3.4h -; CHECK-GI-NEXT: usubl2 v1.4s, v2.8h, v3.8h -; CHECK-GI-NEXT: usubl v2.4s, v4.4h, v5.4h -; CHECK-GI-NEXT: usubl2 v3.4s, v4.8h, v5.8h +; CHECK-GI-NEXT: usubl v2.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: usubl2 v3.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: sshll v0.4s, v2.4h, #0 +; CHECK-GI-NEXT: sshll2 v1.4s, v2.8h, #0 +; CHECK-GI-NEXT: sshll v2.4s, v3.4h, #0 +; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, #0 ; CHECK-GI-NEXT: ret entry: %s0s = zext <16 x i8> %s0 to <16 x i32> @@ -1640,14 +1570,12 @@ define <16 x i32> @sub_ss(<16 x i8> %s0, <16 x i8> %s1) { ; ; CHECK-GI-LABEL: sub_ss: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0 -; CHECK-GI-NEXT: sshll v3.8h, v1.8b, #0 -; CHECK-GI-NEXT: sshll2 v4.8h, v0.16b, #0 -; CHECK-GI-NEXT: sshll2 v5.8h, v1.16b, #0 -; CHECK-GI-NEXT: ssubl v0.4s, v2.4h, v3.4h -; CHECK-GI-NEXT: ssubl2 v1.4s, v2.8h, v3.8h -; CHECK-GI-NEXT: ssubl v2.4s, v4.4h, v5.4h -; CHECK-GI-NEXT: ssubl2 v3.4s, v4.8h, v5.8h +; CHECK-GI-NEXT: ssubl v2.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: ssubl2 v3.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: sshll v0.4s, v2.4h, #0 +; CHECK-GI-NEXT: sshll2 v1.4s, v2.8h, #0 +; CHECK-GI-NEXT: sshll v2.4s, 
v3.4h, #0 +; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, #0 ; CHECK-GI-NEXT: ret entry: %s0s = sext <16 x i8> %s0 to <16 x i32> diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll index ab7cea8dfb7789..c9fe89aec8ad9b 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll @@ -4725,94 +4725,102 @@ define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-GI-NEXT: // kill: def $w3 killed $w3 def $x3 -; CHECK-GI-NEXT: sxtw x8, w3 ; CHECK-GI-NEXT: sxtw x9, w1 +; CHECK-GI-NEXT: sxtw x8, w3 ; CHECK-GI-NEXT: ldr d0, [x0] ; CHECK-GI-NEXT: ldr d1, [x2] ; CHECK-GI-NEXT: add x10, x0, x9 ; CHECK-GI-NEXT: add x11, x2, x8 -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: ldr d2, [x10] -; CHECK-GI-NEXT: add x10, x10, x9 -; CHECK-GI-NEXT: add x12, x11, x8 -; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-GI-NEXT: ldr d3, [x11] -; CHECK-GI-NEXT: ldr d4, [x10] -; CHECK-GI-NEXT: ldr d5, [x12] -; CHECK-GI-NEXT: add x10, x10, x9 -; CHECK-GI-NEXT: add x11, x12, x8 -; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-GI-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-GI-NEXT: ushll v4.8h, v4.8b, #0 -; CHECK-GI-NEXT: ushll v5.8h, v5.8b, #0 -; CHECK-GI-NEXT: uabdl v6.4s, v0.4h, v1.4h -; CHECK-GI-NEXT: uabdl2 v0.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: usubl v0.8h, v0.8b, v1.8b ; CHECK-GI-NEXT: ldr d1, [x10] -; CHECK-GI-NEXT: ldr d7, [x11] +; CHECK-GI-NEXT: ldr d2, [x11] ; CHECK-GI-NEXT: add x10, x10, x9 ; CHECK-GI-NEXT: add x11, x11, x8 -; CHECK-GI-NEXT: uabdl v16.4s, v2.4h, v3.4h -; CHECK-GI-NEXT: uabdl2 v2.4s, v2.8h, v3.8h -; CHECK-GI-NEXT: uabdl v3.4s, v4.4h, v5.4h -; CHECK-GI-NEXT: uabdl2 v4.4s, v4.8h, v5.8h -; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-GI-NEXT: ushll v7.8h, v7.8b, #0 -; CHECK-GI-NEXT: ldr d5, [x10] -; CHECK-GI-NEXT: ldr d17, [x11] +; CHECK-GI-NEXT: usubl v1.8h, v1.8b, v2.8b +; 
CHECK-GI-NEXT: ldr d3, [x10] +; CHECK-GI-NEXT: ldr d4, [x11] +; CHECK-GI-NEXT: sshll v5.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0 +; CHECK-GI-NEXT: add x10, x10, x9 +; CHECK-GI-NEXT: add x11, x11, x8 +; CHECK-GI-NEXT: ldr d2, [x10] ; CHECK-GI-NEXT: add x10, x10, x9 +; CHECK-GI-NEXT: sshll v7.4s, v1.4h, #0 +; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0 +; CHECK-GI-NEXT: ldr d6, [x11] ; CHECK-GI-NEXT: add x11, x11, x8 -; CHECK-GI-NEXT: add v0.4s, v6.4s, v0.4s -; CHECK-GI-NEXT: ushll v5.8h, v5.8b, #0 -; CHECK-GI-NEXT: ushll v17.8h, v17.8b, #0 -; CHECK-GI-NEXT: add v2.4s, v16.4s, v2.4s -; CHECK-GI-NEXT: add v3.4s, v3.4s, v4.4s -; CHECK-GI-NEXT: uabdl v4.4s, v1.4h, v7.4h -; CHECK-GI-NEXT: uabdl2 v1.4s, v1.8h, v7.8h -; CHECK-GI-NEXT: ldr d7, [x10] +; CHECK-GI-NEXT: usubl v3.8h, v3.8b, v4.8b +; CHECK-GI-NEXT: abs v5.4s, v5.4s +; CHECK-GI-NEXT: abs v0.4s, v0.4s +; CHECK-GI-NEXT: ldr d4, [x10] ; CHECK-GI-NEXT: ldr d16, [x11] +; CHECK-GI-NEXT: abs v7.4s, v7.4s +; CHECK-GI-NEXT: abs v1.4s, v1.4s ; CHECK-GI-NEXT: add x10, x10, x9 ; CHECK-GI-NEXT: add x11, x11, x8 -; CHECK-GI-NEXT: ldr d18, [x10] -; CHECK-GI-NEXT: ldr d20, [x10, x9] -; CHECK-GI-NEXT: ldr d19, [x11] -; CHECK-GI-NEXT: ldr d21, [x11, x8] -; CHECK-GI-NEXT: uabdl v6.4s, v5.4h, v17.4h -; CHECK-GI-NEXT: ushll v7.8h, v7.8b, #0 -; CHECK-GI-NEXT: ushll v16.8h, v16.8b, #0 -; CHECK-GI-NEXT: uabdl2 v5.4s, v5.8h, v17.8h -; CHECK-GI-NEXT: ushll v17.8h, v18.8b, #0 -; CHECK-GI-NEXT: ushll v18.8h, v19.8b, #0 -; CHECK-GI-NEXT: add v1.4s, v4.4s, v1.4s -; CHECK-GI-NEXT: ushll v4.8h, v20.8b, #0 -; CHECK-GI-NEXT: ushll v19.8h, v21.8b, #0 -; CHECK-GI-NEXT: addv s2, v2.4s +; CHECK-GI-NEXT: usubl v2.8h, v2.8b, v6.8b +; CHECK-GI-NEXT: ldr d6, [x10] +; CHECK-GI-NEXT: ldr d17, [x11] +; CHECK-GI-NEXT: add x10, x10, x9 +; CHECK-GI-NEXT: add x11, x11, x8 +; CHECK-GI-NEXT: usubl v4.8h, v4.8b, v16.8b +; CHECK-GI-NEXT: sshll v16.4s, v3.4h, #0 +; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, #0 +; CHECK-GI-NEXT: add v0.4s, v5.4s, v0.4s +; 
CHECK-GI-NEXT: add v1.4s, v7.4s, v1.4s +; CHECK-GI-NEXT: ldr d5, [x10] +; CHECK-GI-NEXT: ldr d7, [x11] +; CHECK-GI-NEXT: sshll v18.4s, v2.4h, #0 +; CHECK-GI-NEXT: sshll2 v2.4s, v2.8h, #0 +; CHECK-GI-NEXT: usubl v6.8h, v6.8b, v17.8b +; CHECK-GI-NEXT: ldr d17, [x11, x8] +; CHECK-GI-NEXT: sshll v19.4s, v4.4h, #0 +; CHECK-GI-NEXT: usubl v5.8h, v5.8b, v7.8b +; CHECK-GI-NEXT: ldr d7, [x10, x9] +; CHECK-GI-NEXT: sshll2 v4.4s, v4.8h, #0 +; CHECK-GI-NEXT: abs v16.4s, v16.4s +; CHECK-GI-NEXT: abs v3.4s, v3.4s +; CHECK-GI-NEXT: abs v18.4s, v18.4s +; CHECK-GI-NEXT: abs v2.4s, v2.4s +; CHECK-GI-NEXT: usubl v7.8h, v7.8b, v17.8b +; CHECK-GI-NEXT: sshll v17.4s, v6.4h, #0 +; CHECK-GI-NEXT: sshll2 v6.4s, v6.8h, #0 +; CHECK-GI-NEXT: abs v19.4s, v19.4s +; CHECK-GI-NEXT: abs v4.4s, v4.4s +; CHECK-GI-NEXT: add v3.4s, v16.4s, v3.4s +; CHECK-GI-NEXT: sshll v16.4s, v5.4h, #0 +; CHECK-GI-NEXT: sshll2 v5.4s, v5.8h, #0 +; CHECK-GI-NEXT: add v2.4s, v18.4s, v2.4s +; CHECK-GI-NEXT: abs v17.4s, v17.4s +; CHECK-GI-NEXT: addv s1, v1.4s +; CHECK-GI-NEXT: abs v6.4s, v6.4s ; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: add v4.4s, v19.4s, v4.4s ; CHECK-GI-NEXT: addv s3, v3.4s -; CHECK-GI-NEXT: uabdl v20.4s, v7.4h, v16.4h -; CHECK-GI-NEXT: uabdl2 v7.4s, v7.8h, v16.8h -; CHECK-GI-NEXT: add v5.4s, v6.4s, v5.4s -; CHECK-GI-NEXT: uabdl v6.4s, v17.4h, v18.4h -; CHECK-GI-NEXT: uabdl2 v16.4s, v17.8h, v18.8h -; CHECK-GI-NEXT: uabdl v17.4s, v4.4h, v19.4h -; CHECK-GI-NEXT: uabdl2 v4.4s, v4.8h, v19.8h -; CHECK-GI-NEXT: fmov w8, s2 -; CHECK-GI-NEXT: addv s1, v1.4s +; CHECK-GI-NEXT: sshll v18.4s, v7.4h, #0 +; CHECK-GI-NEXT: sshll2 v7.4s, v7.8h, #0 +; CHECK-GI-NEXT: abs v16.4s, v16.4s +; CHECK-GI-NEXT: abs v5.4s, v5.4s +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: add v6.4s, v17.4s, v6.4s +; CHECK-GI-NEXT: addv s2, v2.4s ; CHECK-GI-NEXT: fmov w9, s0 +; CHECK-GI-NEXT: addv s4, v4.4s ; CHECK-GI-NEXT: fmov w10, s3 -; CHECK-GI-NEXT: add v7.4s, v20.4s, v7.4s -; CHECK-GI-NEXT: add v0.4s, v17.4s, v4.4s -; 
CHECK-GI-NEXT: addv s4, v5.4s -; CHECK-GI-NEXT: add v2.4s, v6.4s, v16.4s +; CHECK-GI-NEXT: abs v18.4s, v18.4s +; CHECK-GI-NEXT: abs v7.4s, v7.4s +; CHECK-GI-NEXT: add v1.4s, v16.4s, v5.4s ; CHECK-GI-NEXT: add w8, w8, w9 -; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: addv s3, v6.4s +; CHECK-GI-NEXT: fmov w9, s2 ; CHECK-GI-NEXT: add w8, w10, w8 -; CHECK-GI-NEXT: addv s3, v7.4s -; CHECK-GI-NEXT: addv s1, v2.4s -; CHECK-GI-NEXT: addv s0, v0.4s -; CHECK-GI-NEXT: add w8, w9, w8 -; CHECK-GI-NEXT: fmov w9, s4 +; CHECK-GI-NEXT: fmov w10, s4 +; CHECK-GI-NEXT: add v0.4s, v18.4s, v7.4s +; CHECK-GI-NEXT: addv s1, v1.4s ; CHECK-GI-NEXT: add w8, w9, w8 ; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: add w8, w10, w8 +; CHECK-GI-NEXT: addv s0, v0.4s ; CHECK-GI-NEXT: add w8, w9, w8 ; CHECK-GI-NEXT: fmov w9, s1 ; CHECK-GI-NEXT: add w8, w9, w8 From 0981dca7779d4acfcbb92fbb29a7a1033e283b88 Mon Sep 17 00:00:00 2001 From: donald chen Date: Wed, 29 May 2024 22:20:49 +0800 Subject: [PATCH 135/230] [mlir][arith] Add neutral element support to arith.maxnumf/arith.minnumf (#93278) For maxnumf and minnumf, the result of calculations involving NaN will be another value, so their neutral element is set to NaN. 
--- mlir/lib/Dialect/Arith/IR/ArithOps.cpp | 14 +++ .../Linalg/transform-op-split-reduction.mlir | 92 +++++++++++++++++++ 2 files changed, 106 insertions(+) diff --git a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp index a0b50251c6b670..5797c5681a5fdd 100644 --- a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp +++ b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp @@ -2467,6 +2467,12 @@ TypedAttr mlir::arith::getIdentityValueAttr(AtomicRMWKind kind, Type resultType, : APFloat::getInf(semantic, /*Negative=*/true); return builder.getFloatAttr(resultType, identity); } + case AtomicRMWKind::maxnumf: { + const llvm::fltSemantics &semantic = + llvm::cast(resultType).getFloatSemantics(); + APFloat identity = APFloat::getNaN(semantic, /*Negative=*/true); + return builder.getFloatAttr(resultType, identity); + } case AtomicRMWKind::addf: case AtomicRMWKind::addi: case AtomicRMWKind::maxu: @@ -2489,6 +2495,12 @@ TypedAttr mlir::arith::getIdentityValueAttr(AtomicRMWKind kind, Type resultType, return builder.getFloatAttr(resultType, identity); } + case AtomicRMWKind::minnumf: { + const llvm::fltSemantics &semantic = + llvm::cast(resultType).getFloatSemantics(); + APFloat identity = APFloat::getNaN(semantic, /*Negative=*/false); + return builder.getFloatAttr(resultType, identity); + } case AtomicRMWKind::mins: return builder.getIntegerAttr( resultType, APInt::getSignedMaxValue( @@ -2518,6 +2530,8 @@ std::optional mlir::arith::getNeutralElement(Operation *op) { .Case([](arith::MulFOp op) { return AtomicRMWKind::mulf; }) .Case([](arith::MaximumFOp op) { return AtomicRMWKind::maximumf; }) .Case([](arith::MinimumFOp op) { return AtomicRMWKind::minimumf; }) + .Case([](arith::MaxNumFOp op) { return AtomicRMWKind::maxnumf; }) + .Case([](arith::MinNumFOp op) { return AtomicRMWKind::minnumf; }) // Integer operations. 
.Case([](arith::AddIOp op) { return AtomicRMWKind::addi; }) .Case([](arith::OrIOp op) { return AtomicRMWKind::ori; }) diff --git a/mlir/test/Dialect/Linalg/transform-op-split-reduction.mlir b/mlir/test/Dialect/Linalg/transform-op-split-reduction.mlir index 31e9fd00cffa04..9849f36285b160 100644 --- a/mlir/test/Dialect/Linalg/transform-op-split-reduction.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-split-reduction.mlir @@ -407,3 +407,95 @@ module attributes {transform.with_named_sequence} { transform.yield } } + +// ----- +// Checks we use nan as the neutral element for maxnumf op. +func.func @generic_split_maxnumf(%in: tensor<32xf32>, %out: tensor) -> tensor { + %r = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, + affine_map<(d0) -> ()>], + iterator_types = ["reduction"]} + ins(%in : tensor<32xf32>) + outs(%out : tensor) { + ^bb0(%arg1: f32, %arg2: f32): + %y = arith.maxnumf %arg1, %arg2 : f32 + linalg.yield %y : f32 + } -> tensor + return %r : tensor +} + +// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0, d1) -> (d0, d1)> +// CHECK-DAG: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d1)> +// CHECK-DAG: #[[$MAP2:.*]] = affine_map<(d0) -> (d0)> +// CHECK-DAG: #[[$MAP3:.*]] = affine_map<(d0) -> ()> +// CHECK-LABEL: func @generic_split_maxnumf +// The float value 0xFFC00000 that is filled into the init tensor represents negative NaN. 
+// CHECK-DAG: %[[ID:.*]] = arith.constant 0xFFC00000 : f32 +// CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}[0, 1]] output_shape [8, 4] : tensor<32xf32> into tensor<8x4xf32> +// CHECK-DAG: %[[INI:.*]] = tensor.empty() : tensor<4xf32> +// CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<4xf32>) -> tensor<4xf32> +// CHECK: %[[G:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["reduction", "parallel"]} +// CHECK-SAME: ins(%[[I1]] : tensor<8x4xf32>) outs(%[[F]] : tensor<4xf32>) { +// CHECK: arith.maxnumf +// CHECK: linalg.yield +// CHECK: } -> tensor<4xf32> +// CHECK: %[[R:.*]] = linalg.generic {indexing_maps = [#[[$MAP2]], #[[$MAP3]]], iterator_types = ["reduction"]} +// CHECK-SAME: ins(%[[G]] : tensor<4xf32>) outs(%{{.*}} : tensor) { +// CHECK: arith.maxnumf {{.*}} +// CHECK: linalg.yield +// CHECK: } -> tensor +// CHECK: return %[[R]] : tensor + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1:4 = transform.structured.split_reduction %0 { split_factor = 4, insert_split_dimension = 0, inner_parallel} + : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- +// Checks we use nan as the neutral element for minnumf op. 
+func.func @generic_split_minnumf(%in: tensor<32xf32>, %out: tensor) -> tensor { + %r = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, + affine_map<(d0) -> ()>], + iterator_types = ["reduction"]} + ins(%in : tensor<32xf32>) + outs(%out : tensor) { + ^bb0(%arg1: f32, %arg2: f32): + %y = arith.minnumf %arg1, %arg2 : f32 + linalg.yield %y : f32 + } -> tensor + return %r : tensor +} + +// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0, d1) -> (d0, d1)> +// CHECK-DAG: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d1)> +// CHECK-DAG: #[[$MAP2:.*]] = affine_map<(d0) -> (d0)> +// CHECK-DAG: #[[$MAP3:.*]] = affine_map<(d0) -> ()> +// CHECK-LABEL: func @generic_split_minnumf +// The float value 0x7FC00000 that is filled into the init tensor represents positive NaN. +// CHECK-DAG: %[[ID:.*]] = arith.constant 0x7FC00000 : f32 +// CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}[0, 1]] output_shape [8, 4] : tensor<32xf32> into tensor<8x4xf32> +// CHECK-DAG: %[[INI:.*]] = tensor.empty() : tensor<4xf32> +// CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<4xf32>) -> tensor<4xf32> +// CHECK: %[[G:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["reduction", "parallel"]} +// CHECK-SAME: ins(%[[I1]] : tensor<8x4xf32>) outs(%[[F]] : tensor<4xf32>) { +// CHECK: arith.minnumf +// CHECK: linalg.yield +// CHECK: } -> tensor<4xf32> +// CHECK: %[[R:.*]] = linalg.generic {indexing_maps = [#[[$MAP2]], #[[$MAP3]]], iterator_types = ["reduction"]} +// CHECK-SAME: ins(%[[G]] : tensor<4xf32>) outs(%{{.*}} : tensor) { +// CHECK: arith.minnumf {{.*}} +// CHECK: linalg.yield +// CHECK: } -> tensor +// CHECK: return %[[R]] : tensor + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1:4 = transform.structured.split_reduction %0 { 
split_factor = 4, insert_split_dimension = 0, inner_parallel} + : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} From 799316ff26cc82d60f276dc62c4a69b5bba1aef3 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Wed, 29 May 2024 14:26:00 +0000 Subject: [PATCH 136/230] [lldb][NFC] Pass Stream& to ToXML methods in RegisterFlags As suggested in a review of some new code for this file, Stream is more general. The code does not need to know that it's backed by a string. --- lldb/include/lldb/Target/RegisterFlags.h | 6 +++--- lldb/source/Target/RegisterFlags.cpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lldb/include/lldb/Target/RegisterFlags.h b/lldb/include/lldb/Target/RegisterFlags.h index 9b343e445678ab..29a47540cd4f5b 100644 --- a/lldb/include/lldb/Target/RegisterFlags.h +++ b/lldb/include/lldb/Target/RegisterFlags.h @@ -15,7 +15,7 @@ namespace lldb_private { -class StreamString; +class Stream; class Log; class RegisterFlags { @@ -56,7 +56,7 @@ class RegisterFlags { /// Output XML that describes this field, to be inserted into a target XML /// file. Reserved characters in field names like "<" are replaced with /// their XML safe equivalents like ">". - void ToXML(StreamString &strm) const; + void ToXML(Stream &strm) const; bool operator<(const Field &rhs) const { return GetStart() < rhs.GetStart(); @@ -119,7 +119,7 @@ class RegisterFlags { std::string AsTable(uint32_t max_width) const; // Output XML that describes this set of flags. 
- void ToXML(StreamString &strm) const; + void ToXML(Stream &strm) const; private: const std::string m_id; diff --git a/lldb/source/Target/RegisterFlags.cpp b/lldb/source/Target/RegisterFlags.cpp index b1669b85fd2fe7..5274960587bf37 100644 --- a/lldb/source/Target/RegisterFlags.cpp +++ b/lldb/source/Target/RegisterFlags.cpp @@ -190,7 +190,7 @@ std::string RegisterFlags::AsTable(uint32_t max_width) const { return table; } -void RegisterFlags::ToXML(StreamString &strm) const { +void RegisterFlags::ToXML(Stream &strm) const { // Example XML: // // @@ -213,7 +213,7 @@ void RegisterFlags::ToXML(StreamString &strm) const { strm.Indent("\n"); } -void RegisterFlags::Field::ToXML(StreamString &strm) const { +void RegisterFlags::Field::ToXML(Stream &strm) const { // Example XML: // strm.Indent(); From 975477e7f7ee1d8c29975224abb452f73b90db36 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 29 May 2024 16:36:52 +0200 Subject: [PATCH 137/230] [CGBuiltin] Explicitly use inbounds GEP (NFCI) All of these are inbounds as they access known offsets in fixed globals. NFCI because constant expression construction currently already infers this, this patch just makes it explicit. --- clang/lib/CodeGen/CGBuiltin.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index a3c65105033247..266bf41fd5577c 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -14074,7 +14074,7 @@ Value *CodeGenFunction::EmitX86CpuIs(StringRef CPUStr) { // Grab the appropriate field from __cpu_model. 
llvm::Value *Idxs[] = {ConstantInt::get(Int32Ty, 0), ConstantInt::get(Int32Ty, Index)}; - llvm::Value *CpuValue = Builder.CreateGEP(STy, CpuModel, Idxs); + llvm::Value *CpuValue = Builder.CreateInBoundsGEP(STy, CpuModel, Idxs); CpuValue = Builder.CreateAlignedLoad(Int32Ty, CpuValue, CharUnits::fromQuantity(4)); @@ -14116,7 +14116,7 @@ CodeGenFunction::EmitX86CpuSupports(std::array FeatureMask) { // global in the struct STy. Value *Idxs[] = {Builder.getInt32(0), Builder.getInt32(3), Builder.getInt32(0)}; - Value *CpuFeatures = Builder.CreateGEP(STy, CpuModel, Idxs); + Value *CpuFeatures = Builder.CreateInBoundsGEP(STy, CpuModel, Idxs); Value *Features = Builder.CreateAlignedLoad(Int32Ty, CpuFeatures, CharUnits::fromQuantity(4)); @@ -14137,7 +14137,7 @@ CodeGenFunction::EmitX86CpuSupports(std::array FeatureMask) { continue; Value *Idxs[] = {Builder.getInt32(0), Builder.getInt32(i - 1)}; Value *Features = Builder.CreateAlignedLoad( - Int32Ty, Builder.CreateGEP(ATy, CpuFeatures2, Idxs), + Int32Ty, Builder.CreateInBoundsGEP(ATy, CpuFeatures2, Idxs), CharUnits::fromQuantity(4)); // Check the value of the bit corresponding to the feature requested. Value *Mask = Builder.getInt32(M); @@ -16724,7 +16724,7 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID, llvm::Value *Idxs[] = {ConstantInt::get(Int32Ty, 0), ConstantInt::get(Int32Ty, FieldIdx)}; - FieldValue = Builder.CreateGEP(STy, SysConf, Idxs); + FieldValue = Builder.CreateInBoundsGEP(STy, SysConf, Idxs); FieldValue = Builder.CreateAlignedLoad(Int32Ty, FieldValue, CharUnits::fromQuantity(4)); } else if (SupportMethod == SYS_CALL) { From 6127f15e5b4834411e8f2e700e25c40490deec35 Mon Sep 17 00:00:00 2001 From: zhijian lin Date: Wed, 29 May 2024 10:53:00 -0400 Subject: [PATCH 138/230] [PowerPC] option `-msoft-float` should not block the PC-relative address instruction (#92543) The Prefix instruction is introduced on PowerPC ISA3_1. In the PR, 1. 
The `FeaturePrefixInstrs` do not imply the `FeatureP8Vector` ,`FeatureP9Vector` . 2. `FeaturePrefixInstrs` implies only the FeatureISA3_1. 3. For the prefix instructions `paddi` and `pli` , they have `Predicates = [PrefixInstrs] ` 4. For the prefix instructions `plfs` and `plfd`, they have `Predicates = [PrefixInstrs, HasFPU] ` 5. For the prefix instructions "plxv` , "plxssp` and `plxsd` , they have `Predicates = [PrefixInstrs, HasP10Vector]` Fixes #62372 --- llvm/lib/Target/PowerPC/PPC.td | 3 +- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 19 +-- llvm/lib/Target/PowerPC/PPCInstrP10.td | 129 ++++++++++++-------- llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp | 2 +- llvm/test/CodeGen/PowerPC/pr62372.ll | 13 ++ 5 files changed, 104 insertions(+), 62 deletions(-) create mode 100644 llvm/test/CodeGen/PowerPC/pr62372.ll diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td index 639771ab9eabbd..84ef582c029d39 100644 --- a/llvm/lib/Target/PowerPC/PPC.td +++ b/llvm/lib/Target/PowerPC/PPC.td @@ -296,8 +296,7 @@ def FeatureVectorsUseTwoUnits : SubtargetFeature<"vectors-use-two-units", def FeaturePrefixInstrs : SubtargetFeature<"prefix-instrs", "HasPrefixInstrs", "true", "Enable prefixed instructions", - [FeatureISA3_0, FeatureP8Vector, - FeatureP9Altivec]>; + [FeatureISA3_1]>; def FeaturePCRelativeMemops : SubtargetFeature<"pcrelative-memops", "HasPCRelativeMemops", "true", "Enable PC relative Memory Ops", diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 8450ce9e0e3b3b..a0e91f4dc3a4a7 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -9460,7 +9460,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, // double. This is to exploit the XXSPLTIDP instruction. // If we lose precision, we use XXSPLTI32DX. 
if (BVNIsConstantSplat && (SplatBitSize == 64) && - Subtarget.hasPrefixInstrs()) { + Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector()) { // Check the type first to short-circuit so we don't modify APSplatBits if // this block isn't executed. if ((Op->getValueType(0) == MVT::v2f64) && @@ -9605,11 +9605,11 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, // with 4-byte splats. We replicate the SplatBits in case of 2-byte splat to // make a 4-byte splat element. For example: 2-byte splat of 0xABAB can be // turned into a 4-byte splat of 0xABABABAB. - if (Subtarget.hasPrefixInstrs() && SplatSize == 2) + if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector() && SplatSize == 2) return getCanonicalConstSplat(SplatBits | (SplatBits << 16), SplatSize * 2, Op.getValueType(), DAG, dl); - if (Subtarget.hasPrefixInstrs() && SplatSize == 4) + if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector() && SplatSize == 4) return getCanonicalConstSplat(SplatBits, SplatSize, Op.getValueType(), DAG, dl); @@ -10242,7 +10242,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); } - if (Subtarget.hasPrefixInstrs()) { + if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector()) { SDValue SplatInsertNode; if ((SplatInsertNode = lowerToXXSPLTI32DX(SVOp, DAG))) return SplatInsertNode; @@ -17730,7 +17730,7 @@ bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, return false; case MVT::f32: case MVT::f64: { - if (Subtarget.hasPrefixInstrs()) { + if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector()) { // we can materialize all immediatess via XXSPLTI32DX and XXSPLTIDP. return true; } @@ -18314,11 +18314,12 @@ unsigned PPCTargetLowering::computeMOFlags(const SDNode *Parent, SDValue N, // Compute subtarget flags. 
if (!Subtarget.hasP9Vector()) FlagSet |= PPC::MOF_SubtargetBeforeP9; - else { + else FlagSet |= PPC::MOF_SubtargetP9; - if (Subtarget.hasPrefixInstrs()) - FlagSet |= PPC::MOF_SubtargetP10; - } + + if (Subtarget.hasPrefixInstrs()) + FlagSet |= PPC::MOF_SubtargetP10; + if (Subtarget.hasSPE()) FlagSet |= PPC::MOF_SubtargetSPE; diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td index 5f2937d47a5195..2fd5978a23c80f 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrP10.td +++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td @@ -654,13 +654,10 @@ let Predicates = [PrefixInstrs] in { (ins s34imm:$SI), "pli $RT, $SI", IIC_IntSimple, []>; } +} +let Predicates = [PrefixInstrs, HasFPU] in { let mayLoad = 1, mayStore = 0 in { - defm PLXV : - 8LS_DForm_R_SI34_XT6_RA5_MEM_p<25, (outs vsrc:$XST), (ins (memri34 $D, $RA):$addr), - (ins (memri34_pcrel $D, $RA):$addr), - (ins s34imm_pcrel:$D), - "plxv $XST, $addr", "plxv $XST, $D", IIC_LdStLFD>; defm PLFS : MLS_DForm_R_SI34_RTA5_MEM_p<48, (outs f4rc:$RST), (ins (memri34 $D, $RA):$addr), (ins (memri34_pcrel $D, $RA):$addr), @@ -671,6 +668,28 @@ let Predicates = [PrefixInstrs] in { (ins (memri34_pcrel $D, $RA):$addr), (ins s34imm_pcrel:$D), "plfd $RST, $addr", "plfd $RST, $D", IIC_LdStLFD>; + } + let mayStore = 1, mayLoad = 0 in { + defm PSTFS : + MLS_DForm_R_SI34_RTA5_MEM_p<52, (outs), (ins f4rc:$RST, (memri34 $D, $RA):$addr), + (ins f4rc:$RST, (memri34_pcrel $D, $RA):$addr), + (ins f4rc:$RST, s34imm_pcrel:$D), + "pstfs $RST, $addr", "pstfs $RST, $D", IIC_LdStLFD>; + defm PSTFD : + MLS_DForm_R_SI34_RTA5_MEM_p<54, (outs), (ins f8rc:$RST, (memri34 $D, $RA):$addr), + (ins f8rc:$RST, (memri34_pcrel $D, $RA):$addr), + (ins f8rc:$RST, s34imm_pcrel:$D), + "pstfd $RST, $addr", "pstfd $RST, $D", IIC_LdStLFD>; + } +} + +let Predicates = [PrefixInstrs, HasP10Vector] in { + let mayLoad = 1, mayStore = 0 in { + defm PLXV : + 8LS_DForm_R_SI34_XT6_RA5_MEM_p<25, (outs vsrc:$XST), (ins (memri34 $D, $RA):$addr), + (ins 
(memri34_pcrel $D, $RA):$addr), + (ins s34imm_pcrel:$D), + "plxv $XST, $addr", "plxv $XST, $D", IIC_LdStLFD>; defm PLXSSP : 8LS_DForm_R_SI34_RTA5_MEM_p<43, (outs vfrc:$RST), (ins (memri34 $D, $RA):$addr), (ins (memri34_pcrel $D, $RA):$addr), @@ -683,6 +702,28 @@ let Predicates = [PrefixInstrs] in { (ins s34imm_pcrel:$D), "plxsd $RST, $addr", "plxsd $RST, $D", IIC_LdStLFD>; + } + let mayStore = 1, mayLoad = 0 in { + defm PSTXV : + 8LS_DForm_R_SI34_XT6_RA5_MEM_p<27, (outs), (ins vsrc:$XST, (memri34 $D, $RA):$addr), + (ins vsrc:$XST, (memri34_pcrel $D, $RA):$addr), + (ins vsrc:$XST, s34imm_pcrel:$D), + "pstxv $XST, $addr", "pstxv $XST, $D", IIC_LdStLFD>; + defm PSTXSSP : + 8LS_DForm_R_SI34_RTA5_MEM_p<47, (outs), (ins vfrc:$RST, (memri34 $D, $RA):$addr), + (ins vfrc:$RST, (memri34_pcrel $D, $RA):$addr), + (ins vfrc:$RST, s34imm_pcrel:$D), + "pstxssp $RST, $addr", "pstxssp $RST, $D", IIC_LdStLFD>; + defm PSTXSD : + 8LS_DForm_R_SI34_RTA5_MEM_p<46, (outs), (ins vfrc:$RST, (memri34 $D, $RA):$addr), + (ins vfrc:$RST, (memri34_pcrel $D, $RA):$addr), + (ins vfrc:$RST, s34imm_pcrel:$D), + "pstxsd $RST, $addr", "pstxsd $RST, $D", IIC_LdStLFD>; + } +} + +let Predicates = [PrefixInstrs] in { + let mayLoad = 1, mayStore = 0 in { let Interpretation64Bit = 1, isCodeGenOnly = 1 in { defm PLBZ8 : MLS_DForm_R_SI34_RTA5_MEM_p<34, (outs g8rc:$RST), (ins (memri34 $D, $RA):$addr), @@ -745,31 +786,6 @@ let Predicates = [PrefixInstrs] in { } let mayStore = 1, mayLoad = 0 in { - defm PSTXV : - 8LS_DForm_R_SI34_XT6_RA5_MEM_p<27, (outs), (ins vsrc:$XST, (memri34 $D, $RA):$addr), - (ins vsrc:$XST, (memri34_pcrel $D, $RA):$addr), - (ins vsrc:$XST, s34imm_pcrel:$D), - "pstxv $XST, $addr", "pstxv $XST, $D", IIC_LdStLFD>; - defm PSTFS : - MLS_DForm_R_SI34_RTA5_MEM_p<52, (outs), (ins f4rc:$RST, (memri34 $D, $RA):$addr), - (ins f4rc:$RST, (memri34_pcrel $D, $RA):$addr), - (ins f4rc:$RST, s34imm_pcrel:$D), - "pstfs $RST, $addr", "pstfs $RST, $D", IIC_LdStLFD>; - defm PSTFD : - 
MLS_DForm_R_SI34_RTA5_MEM_p<54, (outs), (ins f8rc:$RST, (memri34 $D, $RA):$addr), - (ins f8rc:$RST, (memri34_pcrel $D, $RA):$addr), - (ins f8rc:$RST, s34imm_pcrel:$D), - "pstfd $RST, $addr", "pstfd $RST, $D", IIC_LdStLFD>; - defm PSTXSSP : - 8LS_DForm_R_SI34_RTA5_MEM_p<47, (outs), (ins vfrc:$RST, (memri34 $D, $RA):$addr), - (ins vfrc:$RST, (memri34_pcrel $D, $RA):$addr), - (ins vfrc:$RST, s34imm_pcrel:$D), - "pstxssp $RST, $addr", "pstxssp $RST, $D", IIC_LdStLFD>; - defm PSTXSD : - 8LS_DForm_R_SI34_RTA5_MEM_p<46, (outs), (ins vfrc:$RST, (memri34 $D, $RA):$addr), - (ins vfrc:$RST, (memri34_pcrel $D, $RA):$addr), - (ins vfrc:$RST, s34imm_pcrel:$D), - "pstxsd $RST, $addr", "pstxsd $RST, $D", IIC_LdStLFD>; let Interpretation64Bit = 1, isCodeGenOnly = 1 in { defm PSTB8 : MLS_DForm_R_SI34_RTA5_MEM_p<38, (outs), (ins g8rc:$RST, (memri34 $D, $RA):$addr), @@ -1136,7 +1152,7 @@ let mayLoad = 0, mayStore = 1, Predicates = [PairedVectorMemops] in { []>; } -let mayLoad = 1, mayStore = 0, Predicates = [PairedVectorMemops, PrefixInstrs] in { +let mayLoad = 1, mayStore = 0, Predicates = [PairedVectorMemops, PrefixInstrs, HasP10Vector] in { defm PLXVP : 8LS_DForm_R_XTp5_SI34_MEM_p<58, (outs vsrprc:$XTp), (ins (memri34 $D, $RA):$addr), (ins (memri34_pcrel $D, $RA):$addr), @@ -1145,7 +1161,7 @@ let mayLoad = 1, mayStore = 0, Predicates = [PairedVectorMemops, PrefixInstrs] i IIC_LdStLFD>; } -let mayLoad = 0, mayStore = 1, Predicates = [PairedVectorMemops, PrefixInstrs] in { +let mayLoad = 0, mayStore = 1, Predicates = [PairedVectorMemops, PrefixInstrs, HasP10Vector] in { defm PSTXVP : 8LS_DForm_R_XTp5_SI34_MEM_p<62, (outs), (ins vsrprc:$XTp, (memri34 $D, $RA):$addr), (ins vsrprc:$XTp, (memri34_pcrel $D, $RA):$addr), @@ -1157,7 +1173,7 @@ let Predicates = [PairedVectorMemops] in { // Intrinsics for Paired Vector Loads. 
def : Pat<(v256i1 (int_ppc_vsx_lxvp DQForm:$src)), (LXVP memrix16:$src)>; def : Pat<(v256i1 (int_ppc_vsx_lxvp XForm:$src)), (LXVPX XForm:$src)>; - let Predicates = [PairedVectorMemops, PrefixInstrs] in { + let Predicates = [PairedVectorMemops, PrefixInstrs, HasP10Vector] in { def : Pat<(v256i1 (int_ppc_vsx_lxvp PDForm:$src)), (PLXVP memri34:$src)>; } // Intrinsics for Paired Vector Stores. @@ -1165,7 +1181,7 @@ let Predicates = [PairedVectorMemops] in { (STXVP $XSp, memrix16:$dst)>; def : Pat<(int_ppc_vsx_stxvp v256i1:$XSp, XForm:$dst), (STXVPX $XSp, XForm:$dst)>; - let Predicates = [PairedVectorMemops, PrefixInstrs] in { + let Predicates = [PairedVectorMemops, PrefixInstrs, HasP10Vector] in { def : Pat<(int_ppc_vsx_stxvp v256i1:$XSp, PDForm:$dst), (PSTXVP $XSp, memri34:$dst)>; } @@ -1236,6 +1252,9 @@ let Predicates = [PCRelativeMemops] in { def : Pat<(store i64:$RS, (PPCmatpcreladdr PCRelForm:$ga)), (PSTDpc $RS, $ga, 0)>; +} + +let Predicates = [PCRelativeMemops, HasFPU] in { // Load f32 def : Pat<(f32 (load (PPCmatpcreladdr PCRelForm:$addr))), (PLFSpc $addr, 0)>; @@ -1252,6 +1271,11 @@ let Predicates = [PCRelativeMemops] in { def : Pat<(store f64:$FRS, (PPCmatpcreladdr PCRelForm:$ga)), (PSTFDpc $FRS, $ga, 0)>; + def : Pat<(v4f32 (PPCldvsxlh (PPCmatpcreladdr PCRelForm:$addr))), + (SUBREG_TO_REG (i64 1), (PLFDpc $addr, 0), sub_64)>; +} + +let Predicates = [PCRelativeMemops, HasP10Vector] in { // Load f128 def : Pat<(f128 (load (PPCmatpcreladdr PCRelForm:$addr))), (COPY_TO_REGCLASS (PLXVpc $addr, 0), VRRC)>; @@ -1288,6 +1312,14 @@ let Predicates = [PCRelativeMemops] in { def : Pat<(store v2f64:$XS, (PPCmatpcreladdr PCRelForm:$ga)), (PSTXVpc $XS, $ga, 0)>; + // Special Cases For PPCstore_scal_int_from_vsr + def : Pat<(PPCstore_scal_int_from_vsr f64:$src, (PPCmatpcreladdr PCRelForm:$dst), 8), + (PSTXSDpc $src, $dst, 0)>; + def : Pat<(PPCstore_scal_int_from_vsr f128:$src, (PPCmatpcreladdr PCRelForm:$dst), 8), + (PSTXSDpc (COPY_TO_REGCLASS $src, VFRC), $dst, 0)>; +} + 
+let Predicates = [PCRelativeMemops] in { // Atomic Load def : Pat<(i32 (atomic_load_8 (PPCmatpcreladdr PCRelForm:$ga))), (PLBZpc $ga, 0)>; @@ -1314,15 +1346,6 @@ let Predicates = [PCRelativeMemops] in { def : Pat<(atomic_store_64 i64:$RS, (PPCmatpcreladdr PCRelForm:$ga)), (PSTDpc $RS, $ga, 0)>; - // Special Cases For PPCstore_scal_int_from_vsr - def : Pat<(PPCstore_scal_int_from_vsr f64:$src, (PPCmatpcreladdr PCRelForm:$dst), 8), - (PSTXSDpc $src, $dst, 0)>; - def : Pat<(PPCstore_scal_int_from_vsr f128:$src, (PPCmatpcreladdr PCRelForm:$dst), 8), - (PSTXSDpc (COPY_TO_REGCLASS $src, VFRC), $dst, 0)>; - - def : Pat<(v4f32 (PPCldvsxlh (PPCmatpcreladdr PCRelForm:$addr))), - (SUBREG_TO_REG (i64 1), (PLFDpc $addr, 0), sub_64)>; - // If the PPCmatpcreladdr node is not caught by any other pattern it should be // caught here and turned into a paddi instruction to materialize the address. def : Pat<(PPCmatpcreladdr PCRelForm:$addr), (PADDI8pc 0, $addr)>; @@ -1335,7 +1358,7 @@ let Predicates = [PCRelativeMemops] in { (PADDI8 $in, $addr)>; } -let Predicates = [PrefixInstrs] in { +let Predicates = [PrefixInstrs, HasP10Vector] in { def XXPERMX : 8RR_XX4Form_IMM3_XTABC6<34, 0, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, vsrc:$XC, u3imm:$IMM), @@ -2142,7 +2165,7 @@ let AddedComplexity = 400, Predicates = [IsISA3_1, HasVSX] in { class xxevalPattern imm> : Pat<(v4i32 pattern), (XXEVAL $vA, $vB, $vC, imm)> {} -let AddedComplexity = 400, Predicates = [PrefixInstrs] in { +let AddedComplexity = 400, Predicates = [PrefixInstrs, HasP10Vector] in { def : Pat<(v4i32 (build_vector i32immNonAllOneNonZero:$A, i32immNonAllOneNonZero:$A, i32immNonAllOneNonZero:$A, @@ -2279,7 +2302,7 @@ def : Pat<(f64 nzFPImmAsi64:$A), (PSTXSD (COPY_TO_REGCLASS $src, VFRC), PDForm:$dst)>; } -let Predicates = [PrefixInstrs] in { +let Predicates = [PrefixInstrs, HasP10Vector] in { def : Pat<(i32 imm34:$imm), (PLI (getImmAs64BitInt imm:$imm))>; def : Pat<(i64 imm34:$imm), (PLI8 (getImmAs64BitInt imm:$imm))>; def : 
Pat<(v16i8 (int_ppc_vsx_xxpermx v16i8:$A, v16i8:$B, v16i8:$C, timm:$D)), @@ -2300,7 +2323,9 @@ let Predicates = [PrefixInstrs] in { (XXBLENDVW $A, $B, $C)>; def : Pat<(int_ppc_vsx_xxblendvd v2i64:$A, v2i64:$B, v2i64:$C), (XXBLENDVD $A, $B, $C)>; +} +let Predicates = [PrefixInstrs] in { // Anonymous patterns to select prefixed loads and stores. // Load i32 def : Pat<(i32 (extloadi1 PDForm:$src)), (PLBZ memri34:$src)>; @@ -2335,7 +2360,9 @@ let Predicates = [PrefixInstrs] in { def : Pat<(truncstorei16 i64:$rS, PDForm:$dst), (PSTH8 g8rc:$rS, memri34:$dst)>; def : Pat<(truncstorei32 i64:$rS, PDForm:$dst), (PSTW8 g8rc:$rS, memri34:$dst)>; def : Pat<(store i64:$rS, PDForm:$dst), (PSTD g8rc:$rS, memri34:$dst)>; +} +let Predicates = [PrefixInstrs, HasFPU] in { // Load / Store f32 def : Pat<(f32 (load PDForm:$src)), (PLFS memri34:$src)>; def : Pat<(store f32:$FRS, PDForm:$dst), (PSTFS $FRS, memri34:$dst)>; @@ -2345,7 +2372,13 @@ let Predicates = [PrefixInstrs] in { (COPY_TO_REGCLASS (PLFS memri34:$src), VSFRC)>; def : Pat<(f64 (load PDForm:$src)), (PLFD memri34:$src)>; def : Pat<(store f64:$FRS, PDForm:$dst), (PSTFD $FRS, memri34:$dst)>; + // Prefixed fpext to v2f64 + def : Pat<(v4f32 (PPCldvsxlh PDForm:$src)), + (SUBREG_TO_REG (i64 1), (PLFD PDForm:$src), sub_64)>; +} + +let Predicates = [PrefixInstrs] in { // Atomic Load def : Pat<(i32 (atomic_load_8 PDForm:$src)), (PLBZ memri34:$src)>; def : Pat<(i32 (atomic_load_16 PDForm:$src)), (PLHZ memri34:$src)>; @@ -2357,10 +2390,6 @@ let Predicates = [PrefixInstrs] in { def : Pat<(atomic_store_16 i32:$RS, PDForm:$dst), (PSTH $RS, memri34:$dst)>; def : Pat<(atomic_store_32 i32:$RS, PDForm:$dst), (PSTW $RS, memri34:$dst)>; def : Pat<(atomic_store_64 i64:$RS, PDForm:$dst), (PSTD $RS, memri34:$dst)>; - - // Prefixed fpext to v2f64 - def : Pat<(v4f32 (PPCldvsxlh PDForm:$src)), - (SUBREG_TO_REG (i64 1), (PLFD PDForm:$src), sub_64)>; } def InsertEltShift { diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp 
b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index 7e4cd6c72aa87a..9e8da59615dfb3 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -1695,7 +1695,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // transform it to the prefixed version so we don't have to use the XForm. if ((OpC == PPC::LXVP || OpC == PPC::STXVP) && (!isInt<16>(Offset) || (Offset % offsetMinAlign(MI)) != 0) && - Subtarget.hasPrefixInstrs()) { + Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector()) { unsigned NewOpc = OpC == PPC::LXVP ? PPC::PLXVP : PPC::PSTXVP; MI.setDesc(TII.get(NewOpc)); OpC = NewOpc; diff --git a/llvm/test/CodeGen/PowerPC/pr62372.ll b/llvm/test/CodeGen/PowerPC/pr62372.ll new file mode 100644 index 00000000000000..8df236adc92d7b --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/pr62372.ll @@ -0,0 +1,13 @@ +; RUN: llc -ppc-asm-full-reg-names -mcpu=pwr10 -mtriple powerpc64le-unknown-linux-gnu \ +; RUN: -o - %s | FileCheck %s + +@bar = dso_local global i32 0, align 4 + +define dso_local ptr @foo() #0 { +entry: + ret ptr @bar +} + +attributes #0 = { "use-soft-float"="true" } + +; CHECK: paddi r3, 0, bar@PCREL, 1 From cd5045a76a02f87542b2ff3d78352c10aee6395b Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 29 May 2024 08:04:32 -0700 Subject: [PATCH 139/230] [ValueTypes] Use bit instead of int for boolean fields in ValueTypes.td. 
NFC --- llvm/include/llvm/CodeGen/ValueTypes.td | 10 +++++----- llvm/utils/TableGen/VTEmitter.cpp | 14 +++++++------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td index e322cc04c1c769..0d8eaf1b2b1dd0 100644 --- a/llvm/include/llvm/CodeGen/ValueTypes.td +++ b/llvm/include/llvm/CodeGen/ValueTypes.td @@ -18,11 +18,11 @@ class ValueType { int Value = value; int nElem = 1; ValueType ElementType = ?; - int isOverloaded = false; - int isInteger = false; - int isFP = false; - int isVector = false; - int isScalable = false; + bit isOverloaded = false; + bit isInteger = false; + bit isFP = false; + bit isVector = false; + bit isScalable = false; } class VTAny : ValueType<0, value> { diff --git a/llvm/utils/TableGen/VTEmitter.cpp b/llvm/utils/TableGen/VTEmitter.cpp index 5ec1f59318f784..9174fe48b62a9d 100644 --- a/llvm/utils/TableGen/VTEmitter.cpp +++ b/llvm/utils/TableGen/VTEmitter.cpp @@ -68,10 +68,10 @@ void VTEmitter::run(raw_ostream &OS) { continue; auto Name = VT->getValueAsString("LLVMName"); auto Value = VT->getValueAsInt("Value"); - bool IsInteger = VT->getValueAsInt("isInteger"); - bool IsFP = VT->getValueAsInt("isFP"); - bool IsVector = VT->getValueAsInt("isVector"); - bool IsScalable = VT->getValueAsInt("isScalable"); + bool IsInteger = VT->getValueAsBit("isInteger"); + bool IsFP = VT->getValueAsBit("isFP"); + bool IsVector = VT->getValueAsBit("isVector"); + bool IsScalable = VT->getValueAsBit("isScalable"); UpdateVTRange("INTEGER_FIXEDLEN_VECTOR_VALUETYPE", Name, IsInteger && IsVector && !IsScalable); @@ -92,7 +92,7 @@ void VTEmitter::run(raw_ostream &OS) { << Name << ", " << Value << ", " << VT->getValueAsInt("Size") << ", " - << VT->getValueAsInt("isOverloaded") << ", " + << VT->getValueAsBit("isOverloaded") << ", " << (IsInteger ? Name[0] == 'i' ? 3 : 1 : 0) << ", " << (IsFP ? Name[0] == 'f' ? 
3 : 1 : 0) << ", " << IsVector << ", " @@ -111,14 +111,14 @@ void VTEmitter::run(raw_ostream &OS) { OS << "#ifdef GET_VT_VECATTR // (Ty, Sc, nElem, ElTy, ElSz)\n"; for (const auto *VT : VTsByNumber) { - if (!VT || !VT->getValueAsInt("isVector")) + if (!VT || !VT->getValueAsBit("isVector")) continue; const auto *ElTy = VT->getValueAsDef("ElementType"); assert(ElTy); // clang-format off OS << " GET_VT_VECATTR(" << VT->getValueAsString("LLVMName") << ", " - << VT->getValueAsInt("isScalable") << ", " + << VT->getValueAsBit("isScalable") << ", " << VT->getValueAsInt("nElem") << ", " << ElTy->getName() << ", " << ElTy->getValueAsInt("Size") << ")\n"; From b15a0a37404f36bcd9c7995de8cd16f9cb5ac8af Mon Sep 17 00:00:00 2001 From: Farzon Lotfi <1802579+farzonl@users.noreply.github.com> Date: Wed, 29 May 2024 11:16:18 -0400 Subject: [PATCH 140/230] [clang] Add tanf16 builtin and support for tan constrained intrinsic (#93314) In LLVM, the `llvm.experimental.constrained.cos` and `llvm.experimental.constrained.sin` intrinsics are used for performing cosine and sine calculations with additional constraints on floating-point operations. This behavior is expected for all floating-point math intrinsics. This change adds these constraints for the `tan` intrinsic. - `Builtins.td` - replace TanF128 with F16F128MathTemplate - `CGBuiltin.cpp` - map existing tan builtins to `tan` and `constrained_tan` intrinsic - `ConstrainedOps.def` map tan and constrained_tan to an ISDOpcode. 
- `ISDOpcodes.h` - define tan and strict tan opcodes resolves #91421 --- clang/include/clang/Basic/Builtins.td | 6 ++-- clang/lib/CodeGen/CGBuiltin.cpp | 12 +++++++ clang/test/CodeGen/X86/math-builtins.c | 8 ++--- .../test/CodeGen/constrained-math-builtins.c | 13 +++++++ clang/test/CodeGen/math-libcalls.c | 12 +++---- clang/test/CodeGenOpenCL/builtins-f16.cl | 3 ++ llvm/docs/LangRef.rst | 36 +++++++++++++++++++ llvm/include/llvm/CodeGen/ISDOpcodes.h | 2 ++ llvm/include/llvm/IR/ConstrainedOps.def | 1 + llvm/include/llvm/IR/Intrinsics.td | 4 +++ llvm/test/Assembler/fp-intrinsics-attr.ll | 8 +++++ llvm/test/Feature/fp-intrinsics.ll | 11 ++++++ 12 files changed, 103 insertions(+), 13 deletions(-) diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 11982af3fa609b..7bef5fd7ad40f2 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -482,11 +482,11 @@ def SqrtF16F128 : Builtin, F16F128MathTemplate { let Prototype = "T(T)"; } -def TanF128 : Builtin { - let Spellings = ["__builtin_tanf128"]; +def TanF16F128 : Builtin, F16F128MathTemplate { + let Spellings = ["__builtin_tan"]; let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions]; - let Prototype = "__float128(__float128)"; + let Prototype = "T(T)"; } def TanhF128 : Builtin { diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 266bf41fd5577c..94a7036f6233cc 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -2923,6 +2923,18 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, SetSqrtFPAccuracy(Call); return RValue::get(Call); } + + case Builtin::BItan: + case Builtin::BItanf: + case Builtin::BItanl: + case Builtin::BI__builtin_tan: + case Builtin::BI__builtin_tanf: + case Builtin::BI__builtin_tanf16: + case Builtin::BI__builtin_tanl: + case Builtin::BI__builtin_tanf128: + return 
RValue::get(emitUnaryMaybeConstrainedFPBuiltin( + *this, E, Intrinsic::tan, Intrinsic::experimental_constrained_tan)); + case Builtin::BItrunc: case Builtin::BItruncf: case Builtin::BItruncl: diff --git a/clang/test/CodeGen/X86/math-builtins.c b/clang/test/CodeGen/X86/math-builtins.c index 093239b4482609..1e0f129b986102 100644 --- a/clang/test/CodeGen/X86/math-builtins.c +++ b/clang/test/CodeGen/X86/math-builtins.c @@ -674,10 +674,10 @@ __builtin_sqrt(f); __builtin_sqrtf(f); __builtin_sqrtl(f); __builtin_ __builtin_tan(f); __builtin_tanf(f); __builtin_tanl(f); __builtin_tanf128(f); -// NO__ERRNO: declare double @tan(double noundef) [[READNONE]] -// NO__ERRNO: declare float @tanf(float noundef) [[READNONE]] -// NO__ERRNO: declare x86_fp80 @tanl(x86_fp80 noundef) [[READNONE]] -// NO__ERRNO: declare fp128 @tanf128(fp128 noundef) [[READNONE]] +// NO__ERRNO: declare double @llvm.tan.f64(double) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare float @llvm.tan.f32(float) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare x86_fp80 @llvm.tan.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare fp128 @llvm.tan.f128(fp128) [[READNONE_INTRINSIC]] // HAS_ERRNO: declare double @tan(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @tanf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @tanl(x86_fp80 noundef) [[NOT_READNONE]] diff --git a/clang/test/CodeGen/constrained-math-builtins.c b/clang/test/CodeGen/constrained-math-builtins.c index 2de832dd2b6cae..6cc3a10a1e7946 100644 --- a/clang/test/CodeGen/constrained-math-builtins.c +++ b/clang/test/CodeGen/constrained-math-builtins.c @@ -183,6 +183,14 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c, _ // CHECK: call x86_fp80 @llvm.experimental.constrained.sqrt.f80(x86_fp80 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict") // CHECK: call fp128 @llvm.experimental.constrained.sqrt.f128(fp128 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict") + 
__builtin_tan(f); __builtin_tanf(f); __builtin_tanl(f); __builtin_tanf128(f); + +// CHECK: call double @llvm.experimental.constrained.tan.f64(double %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict") +// CHECK: call float @llvm.experimental.constrained.tan.f32(float %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict") +// CHECK: call x86_fp80 @llvm.experimental.constrained.tan.f80(x86_fp80 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict") +// CHECK: call fp128 @llvm.experimental.constrained.tan.f128(fp128 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict") + + __builtin_trunc(f); __builtin_truncf(f); __builtin_truncl(f); __builtin_truncf128(f); // CHECK: call double @llvm.experimental.constrained.trunc.f64(double %{{.*}}, metadata !"fpexcept.strict") @@ -315,6 +323,11 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c, _ // CHECK: declare x86_fp80 @llvm.experimental.constrained.sqrt.f80(x86_fp80, metadata, metadata) // CHECK: declare fp128 @llvm.experimental.constrained.sqrt.f128(fp128, metadata, metadata) +// CHECK: declare double @llvm.experimental.constrained.tan.f64(double, metadata, metadata) +// CHECK: declare float @llvm.experimental.constrained.tan.f32(float, metadata, metadata) +// CHECK: declare x86_fp80 @llvm.experimental.constrained.tan.f80(x86_fp80, metadata, metadata) +// CHECK: declare fp128 @llvm.experimental.constrained.tan.f128(fp128, metadata, metadata) + // CHECK: declare double @llvm.experimental.constrained.trunc.f64(double, metadata) // CHECK: declare float @llvm.experimental.constrained.trunc.f32(float, metadata) // CHECK: declare x86_fp80 @llvm.experimental.constrained.trunc.f80(x86_fp80, metadata) diff --git a/clang/test/CodeGen/math-libcalls.c b/clang/test/CodeGen/math-libcalls.c index 29c312ba0ecac2..a249182692762d 100644 --- a/clang/test/CodeGen/math-libcalls.c +++ b/clang/test/CodeGen/math-libcalls.c @@ -662,15 +662,15 @@ void foo(double *d, 
float f, float *fp, long double *l, int *i, const char *c) { tan(f); tanf(f); tanl(f); -// NO__ERRNO: declare double @tan(double noundef) [[READNONE]] -// NO__ERRNO: declare float @tanf(float noundef) [[READNONE]] -// NO__ERRNO: declare x86_fp80 @tanl(x86_fp80 noundef) [[READNONE]] +// NO__ERRNO: declare double @llvm.tan.f64(double) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare float @llvm.tan.f32(float) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare x86_fp80 @llvm.tan.f80(x86_fp80) [[READNONE_INTRINSIC]] // HAS_ERRNO: declare double @tan(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @tanf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @tanl(x86_fp80 noundef) [[NOT_READNONE]] -// HAS_MAYTRAP: declare double @tan(double noundef) [[NOT_READNONE]] -// HAS_MAYTRAP: declare float @tanf(float noundef) [[NOT_READNONE]] -// HAS_MAYTRAP: declare x86_fp80 @tanl(x86_fp80 noundef) [[NOT_READNONE]] +// HAS_MAYTRAP: declare double @llvm.experimental.constrained.tan.f64( +// HAS_MAYTRAP: declare float @llvm.experimental.constrained.tan.f32( +// HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.tan.f80( tanh(f); tanhf(f); tanhl(f); diff --git a/clang/test/CodeGenOpenCL/builtins-f16.cl b/clang/test/CodeGenOpenCL/builtins-f16.cl index adf7cdde154f51..d7bffdad5c548f 100644 --- a/clang/test/CodeGenOpenCL/builtins-f16.cl +++ b/clang/test/CodeGenOpenCL/builtins-f16.cl @@ -66,6 +66,9 @@ void test_half_builtins(half h0, half h1, half h2, int i0) { // CHECK: call half @llvm.sqrt.f16(half %h0) res = __builtin_sqrtf16(h0); + // CHECK: call half @llvm.tan.f16(half %h0) + res = __builtin_tanf16(h0); + // CHECK: call half @llvm.trunc.f16(half %h0) res = __builtin_truncf16(h0); diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 7b64c477d13c7f..a650692d44d76e 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -26229,6 +26229,42 @@ same values as the libm ``cos`` functions would, and handles error conditions in the same way. 
+'``llvm.experimental.constrained.tan``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare <type> + @llvm.experimental.constrained.tan(<type> <op1>, + metadata <rounding mode>, + metadata <exception behavior>) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.tan``' intrinsic returns the tangent of the +first operand. + +Arguments: +"""""""""" + +The first argument and the return type are floating-point numbers of the same +type. + +The second and third arguments specify the rounding mode and exception +behavior as described above. + +Semantics: +"""""""""" + +This function returns the tangent of the specified operand, returning the +same values as the libm ``tan`` functions would, and handles error +conditions in the same way. + + '``llvm.experimental.constrained.exp``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index d8af97957e48ec..22062f0efbbda1 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -415,6 +415,7 @@ enum NodeType { STRICT_FLDEXP, STRICT_FSIN, STRICT_FCOS, + STRICT_FTAN, STRICT_FEXP, STRICT_FEXP2, STRICT_FLOG, @@ -934,6 +935,7 @@ enum NodeType { FCBRT, FSIN, FCOS, + FTAN, FPOW, FPOWI, /// FLDEXP - ldexp, inspired by libm (op0 * 2**op1). 
diff --git a/llvm/include/llvm/IR/ConstrainedOps.def b/llvm/include/llvm/IR/ConstrainedOps.def index 41aa44de957f93..a7b37c5cb204da 100644 --- a/llvm/include/llvm/IR/ConstrainedOps.def +++ b/llvm/include/llvm/IR/ConstrainedOps.def @@ -95,6 +95,7 @@ DAG_FUNCTION(round, 1, 0, experimental_constrained_round, FROUND) DAG_FUNCTION(roundeven, 1, 0, experimental_constrained_roundeven, FROUNDEVEN) DAG_FUNCTION(sin, 1, 1, experimental_constrained_sin, FSIN) DAG_FUNCTION(sqrt, 1, 1, experimental_constrained_sqrt, FSQRT) +DAG_FUNCTION(tan, 1, 1, experimental_constrained_tan, FTAN) DAG_FUNCTION(trunc, 1, 0, experimental_constrained_trunc, FTRUNC) // This is definition for fmuladd intrinsic function, that is converted into diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 107442623ab7bd..4c506a6ace23ea 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1218,6 +1218,10 @@ let IntrProperties = [IntrInaccessibleMemOnly, IntrWillReturn, IntrStrictFP] in [ LLVMMatchType<0>, llvm_metadata_ty, llvm_metadata_ty ]>; + def int_experimental_constrained_tan : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; def int_experimental_constrained_pow : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ], [ LLVMMatchType<0>, LLVMMatchType<0>, diff --git a/llvm/test/Assembler/fp-intrinsics-attr.ll b/llvm/test/Assembler/fp-intrinsics-attr.ll index 6546d1a275c99f..613630e1a2b4d2 100644 --- a/llvm/test/Assembler/fp-intrinsics-attr.ll +++ b/llvm/test/Assembler/fp-intrinsics-attr.ll @@ -85,6 +85,11 @@ define void @func(double %a, double %b, double %c, i32 %i) strictfp { metadata !"round.dynamic", metadata !"fpexcept.strict") + %tan = call double @llvm.experimental.constrained.tan.f64( + double %a, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + %pow = call double @llvm.experimental.constrained.pow.f64( double %a, double %b, metadata !"round.dynamic", @@ 
-244,6 +249,9 @@ declare double @llvm.experimental.constrained.sin.f64(double, metadata, metadata declare double @llvm.experimental.constrained.cos.f64(double, metadata, metadata) ; CHECK: @llvm.experimental.constrained.cos.f64({{.*}}) #[[ATTR1]] +declare double @llvm.experimental.constrained.tan.f64(double, metadata, metadata) +; CHECK: @llvm.experimental.constrained.tan.f64({{.*}}) #[[ATTR1]] + declare double @llvm.experimental.constrained.pow.f64(double, double, metadata, metadata) ; CHECK: @llvm.experimental.constrained.pow.f64({{.*}}) #[[ATTR1]] diff --git a/llvm/test/Feature/fp-intrinsics.ll b/llvm/test/Feature/fp-intrinsics.ll index b92408a1bf1cd5..7759813dc2e114 100644 --- a/llvm/test/Feature/fp-intrinsics.ll +++ b/llvm/test/Feature/fp-intrinsics.ll @@ -151,6 +151,17 @@ entry: ret double %result } +; Verify that tan(42.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: ftan +; CHECK: call double @llvm.experimental.constrained.tan +define double @ftan() #0 { +entry: + %result = call double @llvm.experimental.constrained.tan.f64(double 42.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + ; Verify that exp(42.0) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f10 ; CHECK: call double @llvm.experimental.constrained.exp From fe82a3da36196157c0caa1ef2505186782f750d1 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Wed, 29 May 2024 16:16:08 +0100 Subject: [PATCH 141/230] Revert "[Support] Remove terminfo dependency (#92865)" This reverts commit 6bf450c7a60fa62c642e39836566da94bb9bbc91. 
It breaks LLDB CI: https://green.lab.llvm.org/job/llvm.org/view/LLDB/job/as-lldb-cmake/4762/execution/node/97/log/ ``` /Applications/Xcode-beta.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/c++ -Wdocumentation -fPIC -fvisibility-inlines-hidden -Werror=date-time -Werror=unguarded-availability-new -Wall -Wextra -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wmissing-field-initializers -pedantic -Wno-long-long -Wc++98-compat-extra-semi -Wimplicit-fallthrough -Wcovered-switch-default -Wno-noexcept-type -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wsuggest-override -Wstring-conversion -Wmisleading-indentation -Wctad-maybe-unsupported -fdiagnostics-color -Wno-deprecated-declarations -Wno-unknown-pragmas -Wno-strict-aliasing -Wno-deprecated-register -Wno-vla-extension -O3 -DNDEBUG -arch arm64 -isysroot /Applications/Xcode-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.2.sdk -mmacosx-version-min=14.1 -Wl,-search_paths_first -Wl,-headerpad_max_install_names -Wl,-dead_strip -Wl,-no_warn_duplicate_libraries tools/lldb/unittests/Editline/CMakeFiles/EditlineTests.dir/EditlineTest.cpp.o -o tools/lldb/unittests/Editline/EditlineTests lib/libLLVMSupport.a lib/libllvm_gtest_main.a lib/libllvm_gtest.a lib/liblldbHost.a lib/liblldbUtility.a lib/libLLVMTestingSupport.a /Applications/Xcode-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.2.sdk/usr/lib/libxml2.tbd /Applications/Xcode-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.2.sdk/usr/lib/libedit.tbd lib/liblldbHostMacOSXObjCXX.a lib/liblldbUtility.a -framework Foundation -framework CoreFoundation -framework CoreServices -framework Security lib/libLLVMObject.a lib/libLLVMIRReader.a lib/libLLVMBitReader.a lib/libLLVMAsmParser.a lib/libLLVMCore.a lib/libLLVMRemarks.a lib/libLLVMBitstreamReader.a lib/libLLVMMCParser.a lib/libLLVMMC.a lib/libLLVMDebugInfoCodeView.a lib/libLLVMTextAPI.a lib/libLLVMBinaryFormat.a 
lib/libLLVMTargetParser.a lib/libllvm_gtest.a lib/libLLVMSupport.a -lm /Applications/Xcode-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.2.sdk/usr/lib/libz.tbd /opt/homebrew/lib/libzstd.dylib lib/libLLVMDemangle.a -lpthread && cd /Users/ec2-user/jenkins/workspace/llvm.org/as-lldb-cmake/lldb-build/tools/lldb/unittests/Editline && /opt/homebrew/Cellar/cmake/3.28.3/bin/cmake -E make_directory /Users/ec2-user/jenkins/workspace/llvm.org/as-lldb-cmake/lldb-build/tools/lldb/unittests/Editline/./Inputs ld: Undefined symbols: _setupterm, referenced from: lldb_private::Editline::Editline(char const*, __sFILE*, __sFILE*, __sFILE*, std::__1::recursive_mutex&) in liblldbHost.a[35](Editline.cpp.o) clang: error: linker command failed with exit code 1 (use -v to see invocation) ``` --- clang/cmake/caches/Fuchsia-stage2.cmake | 1 + clang/cmake/caches/Fuchsia.cmake | 7 +++ clang/cmake/caches/VectorEngine.cmake | 4 +- clang/utils/analyzer/entrypoint.py | 2 +- compiler-rt/cmake/config-ix.cmake | 15 +++++ .../symbolizer/scripts/build_symbolizer.sh | 1 + compiler-rt/lib/xray/tests/CMakeLists.txt | 5 ++ lldb/docs/resources/build.rst | 1 + lldb/source/Core/CMakeLists.txt | 3 + llvm/CMakeLists.txt | 2 + llvm/cmake/config-ix.cmake | 10 ++++ llvm/cmake/modules/FindTerminfo.cmake | 55 +++++++++++++++++ llvm/cmake/modules/LLVMConfig.cmake.in | 5 ++ llvm/docs/ReleaseNotes.rst | 4 -- llvm/include/llvm/Config/config.h.cmake | 3 + llvm/lib/Support/CMakeLists.txt | 11 ++++ llvm/lib/Support/Unix/Process.inc | 60 +++++++++++++++++-- llvm/utils/gn/README.rst | 2 +- llvm/utils/gn/build/libs/terminfo/BUILD.gn | 12 ++++ llvm/utils/gn/build/libs/terminfo/enable.gni | 4 ++ .../llvm/include/llvm/Config/BUILD.gn | 7 +++ .../gn/secondary/llvm/lib/Support/BUILD.gn | 1 + .../secondary/llvm/tools/llvm-config/BUILD.gn | 6 +- utils/bazel/.bazelrc | 3 + .../llvm/include/llvm/Config/config.h | 3 + utils/bazel/llvm_configs/config.h.cmake | 3 + 26 files changed, 218 insertions(+), 12 
deletions(-) create mode 100644 llvm/cmake/modules/FindTerminfo.cmake create mode 100644 llvm/utils/gn/build/libs/terminfo/BUILD.gn create mode 100644 llvm/utils/gn/build/libs/terminfo/enable.gni diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake index 66e764968e85ce..d5546e20873b3c 100644 --- a/clang/cmake/caches/Fuchsia-stage2.cmake +++ b/clang/cmake/caches/Fuchsia-stage2.cmake @@ -19,6 +19,7 @@ set(LLVM_ENABLE_LLD ON CACHE BOOL "") set(LLVM_ENABLE_LTO ON CACHE BOOL "") set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "") set(LLVM_ENABLE_PLUGINS OFF CACHE BOOL "") +set(LLVM_ENABLE_TERMINFO OFF CACHE BOOL "") set(LLVM_ENABLE_UNWIND_TABLES OFF CACHE BOOL "") set(LLVM_ENABLE_Z3_SOLVER OFF CACHE BOOL "") set(LLVM_ENABLE_ZLIB ON CACHE BOOL "") diff --git a/clang/cmake/caches/Fuchsia.cmake b/clang/cmake/caches/Fuchsia.cmake index 4d3af3ad3f4031..30a3b9116a461f 100644 --- a/clang/cmake/caches/Fuchsia.cmake +++ b/clang/cmake/caches/Fuchsia.cmake @@ -12,6 +12,7 @@ set(LLVM_ENABLE_DIA_SDK OFF CACHE BOOL "") set(LLVM_ENABLE_LIBEDIT OFF CACHE BOOL "") set(LLVM_ENABLE_LIBXML2 OFF CACHE BOOL "") set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "") +set(LLVM_ENABLE_TERMINFO OFF CACHE BOOL "") set(LLVM_ENABLE_UNWIND_TABLES OFF CACHE BOOL "") set(LLVM_ENABLE_Z3_SOLVER OFF CACHE BOOL "") set(LLVM_ENABLE_ZLIB OFF CACHE BOOL "") @@ -33,6 +34,7 @@ set(_FUCHSIA_BOOTSTRAP_PASSTHROUGH LibXml2_ROOT LLVM_ENABLE_CURL LLVM_ENABLE_HTTPLIB + LLVM_ENABLE_TERMINFO LLVM_ENABLE_LIBEDIT CURL_ROOT OpenSSL_ROOT @@ -45,6 +47,11 @@ set(_FUCHSIA_BOOTSTRAP_PASSTHROUGH CURSES_LIBRARIES PANEL_LIBRARIES + # Deprecated + Terminfo_ROOT + + Terminfo_LIBRARIES + # Deprecated LibEdit_ROOT diff --git a/clang/cmake/caches/VectorEngine.cmake b/clang/cmake/caches/VectorEngine.cmake index b429fb0997d7a0..2f968a21cc407e 100644 --- a/clang/cmake/caches/VectorEngine.cmake +++ b/clang/cmake/caches/VectorEngine.cmake @@ -13,7 +13,9 @@ # ninja # -# Disable ZLIB, and ZSTD 
for VE since there is no pre-compiled libraries. +# Disable TERMINFO, ZLIB, and ZSTD for VE since there is no pre-compiled +# libraries. +set(LLVM_ENABLE_TERMINFO OFF CACHE BOOL "") set(LLVM_ENABLE_ZLIB OFF CACHE BOOL "") set(LLVM_ENABLE_ZSTD OFF CACHE BOOL "") diff --git a/clang/utils/analyzer/entrypoint.py b/clang/utils/analyzer/entrypoint.py index 4deb42db0a0b1f..ff877060bad69e 100644 --- a/clang/utils/analyzer/entrypoint.py +++ b/clang/utils/analyzer/entrypoint.py @@ -54,7 +54,7 @@ def is_cmake_needed(): "cmake -G Ninja -DCMAKE_BUILD_TYPE=Release " "-DCMAKE_INSTALL_PREFIX=/analyzer -DLLVM_TARGETS_TO_BUILD=X86 " '-DLLVM_ENABLE_PROJECTS="clang;openmp" -DLLVM_BUILD_RUNTIME=OFF ' - "-DCLANG_ENABLE_ARCMT=OFF " + "-DLLVM_ENABLE_TERMINFO=OFF -DCLANG_ENABLE_ARCMT=OFF " "-DCLANG_ENABLE_STATIC_ANALYZER=ON" ) diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake index bddaa37579fd7b..42edbe15edafb5 100644 --- a/compiler-rt/cmake/config-ix.cmake +++ b/compiler-rt/cmake/config-ix.cmake @@ -182,6 +182,21 @@ check_library_exists(m pow "" COMPILER_RT_HAS_LIBM) check_library_exists(pthread pthread_create "" COMPILER_RT_HAS_LIBPTHREAD) check_library_exists(execinfo backtrace "" COMPILER_RT_HAS_LIBEXECINFO) +# Look for terminfo library, used in unittests that depend on LLVMSupport. +if(LLVM_ENABLE_TERMINFO STREQUAL FORCE_ON) + set(MAYBE_REQUIRED REQUIRED) +else() + set(MAYBE_REQUIRED) +endif() +if(LLVM_ENABLE_TERMINFO) + find_library(COMPILER_RT_TERMINFO_LIB NAMES terminfo tinfo curses ncurses ncursesw ${MAYBE_REQUIRED}) +endif() +if(COMPILER_RT_TERMINFO_LIB) + set(LLVM_ENABLE_TERMINFO 1) +else() + set(LLVM_ENABLE_TERMINFO 0) +endif() + if (ANDROID AND COMPILER_RT_HAS_LIBDL) # Android's libstdc++ has a dependency on libdl. 
list(APPEND CMAKE_REQUIRED_LIBRARIES dl) diff --git a/compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh b/compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh index b4702339db59cc..005bd6d584c593 100755 --- a/compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh +++ b/compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh @@ -139,6 +139,7 @@ if [[ ! -f ${LLVM_BUILD}/build.ninja ]]; then -DLLVM_INCLUDE_TESTS=OFF \ -DLLVM_ENABLE_ZLIB=ON \ -DLLVM_ENABLE_ZSTD=OFF \ + -DLLVM_ENABLE_TERMINFO=OFF \ -DLLVM_ENABLE_THREADS=OFF \ $LLVM_SRC fi diff --git a/compiler-rt/lib/xray/tests/CMakeLists.txt b/compiler-rt/lib/xray/tests/CMakeLists.txt index 4c7e92b6ecc3d2..0a428b9a30b18b 100644 --- a/compiler-rt/lib/xray/tests/CMakeLists.txt +++ b/compiler-rt/lib/xray/tests/CMakeLists.txt @@ -54,6 +54,11 @@ set(XRAY_UNITTEST_LINK_FLAGS ${COMPILER_RT_CXX_LINK_LIBS}) if (NOT APPLE) + # Needed by LLVMSupport. + append_list_if( + LLVM_ENABLE_TERMINFO + -l${COMPILER_RT_TERMINFO_LIB} XRAY_UNITTEST_LINK_FLAGS) + # We add the library directories one at a time in our CFLAGS. 
foreach (DIR ${LLVM_LIBRARY_DIR}) list(APPEND XRAY_UNITTEST_LINK_FLAGS -L${DIR}) diff --git a/lldb/docs/resources/build.rst b/lldb/docs/resources/build.rst index 33b6a6f79def4b..09d3d15a940836 100644 --- a/lldb/docs/resources/build.rst +++ b/lldb/docs/resources/build.rst @@ -477,6 +477,7 @@ further by passing the appropriate cmake options, such as: -DLLDB_ENABLE_PYTHON=0 -DLLDB_ENABLE_LIBEDIT=0 -DLLDB_ENABLE_CURSES=0 + -DLLVM_ENABLE_TERMINFO=0 (see :ref:`Optional Dependencies` for more) diff --git a/lldb/source/Core/CMakeLists.txt b/lldb/source/Core/CMakeLists.txt index dbc620b91b1ed1..471fd9c1a33e59 100644 --- a/lldb/source/Core/CMakeLists.txt +++ b/lldb/source/Core/CMakeLists.txt @@ -11,6 +11,9 @@ set(LLDB_LIBEDIT_LIBS) if (LLDB_ENABLE_CURSES) list(APPEND LLDB_CURSES_LIBS ${PANEL_LIBRARIES} ${CURSES_LIBRARIES}) + if(LLVM_ENABLE_TERMINFO) + list(APPEND LLDB_CURSES_LIBS ${Terminfo_LIBRARIES}) + endif() if (LLVM_BUILD_STATIC) list(APPEND LLDB_CURSES_LIBS gpm) endif() diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 64898ab09772f4..cbf4db60a6e185 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -539,6 +539,8 @@ set(FFI_INCLUDE_DIR "" CACHE PATH "Additional directory, where CMake should sear set(LLVM_TARGET_ARCH "host" CACHE STRING "Set target to use for LLVM JIT or use \"host\" for automatic detection.") +option(LLVM_ENABLE_TERMINFO "Use terminfo database if available." ON) + set(LLVM_ENABLE_LIBXML2 "ON" CACHE STRING "Use libxml2 if available. Can be ON, OFF, or FORCE_ON") option(LLVM_ENABLE_LIBEDIT "Use libedit if available." 
ON) diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake index 0aae13e30f2ab4..7d2a49337e1e86 100644 --- a/llvm/cmake/config-ix.cmake +++ b/llvm/cmake/config-ix.cmake @@ -240,11 +240,21 @@ if(NOT LLVM_USE_SANITIZER MATCHES "Memory.*") else() set(HAVE_LIBEDIT 0) endif() + if(LLVM_ENABLE_TERMINFO) + if(LLVM_ENABLE_TERMINFO STREQUAL FORCE_ON) + find_package(Terminfo REQUIRED) + else() + find_package(Terminfo) + endif() + set(LLVM_ENABLE_TERMINFO "${Terminfo_FOUND}") + endif() else() set(HAVE_LIBEDIT 0) + set(LLVM_ENABLE_TERMINFO 0) endif() else() set(HAVE_LIBEDIT 0) + set(LLVM_ENABLE_TERMINFO 0) endif() if(LLVM_HAS_LOGF128) diff --git a/llvm/cmake/modules/FindTerminfo.cmake b/llvm/cmake/modules/FindTerminfo.cmake new file mode 100644 index 00000000000000..163af669706771 --- /dev/null +++ b/llvm/cmake/modules/FindTerminfo.cmake @@ -0,0 +1,55 @@ +# Attempts to discover terminfo library with a linkable setupterm function. +# +# Example usage: +# +# find_package(Terminfo) +# +# If successful, the following variables will be defined: +# Terminfo_FOUND +# Terminfo_LIBRARIES +# +# Additionally, the following import target will be defined: +# Terminfo::terminfo + +find_library(Terminfo_LIBRARIES NAMES terminfo tinfo curses ncurses ncursesw) + +if(Terminfo_LIBRARIES) + include(CMakePushCheckState) + cmake_push_check_state() + list(APPEND CMAKE_REQUIRED_LIBRARIES ${Terminfo_LIBRARIES}) + set(Terminfo_LINKABLE_SRC [=[ + #ifdef __cplusplus + extern "C" { + #endif + int setupterm(char *term, int filedes, int *errret); + #ifdef __cplusplus + } + #endif + int main(void) { return setupterm(0, 0, 0); } + ]=]) + if(DEFINED CMAKE_C_COMPILER) + include(CheckCSourceCompiles) + check_c_source_compiles("${Terminfo_LINKABLE_SRC}" Terminfo_LINKABLE) + else() + include(CheckCXXSourceCompiles) + check_cxx_source_compiles("${Terminfo_LINKABLE_SRC}" Terminfo_LINKABLE) + endif() + cmake_pop_check_state() +endif() + +include(FindPackageHandleStandardArgs) 
+find_package_handle_standard_args(Terminfo + FOUND_VAR + Terminfo_FOUND + REQUIRED_VARS + Terminfo_LIBRARIES + Terminfo_LINKABLE) +mark_as_advanced(Terminfo_LIBRARIES + Terminfo_LINKABLE) + +if(Terminfo_FOUND) + if(NOT TARGET Terminfo::terminfo) + add_library(Terminfo::terminfo UNKNOWN IMPORTED) + set_target_properties(Terminfo::terminfo PROPERTIES IMPORTED_LOCATION "${Terminfo_LIBRARIES}") + endif() +endif() diff --git a/llvm/cmake/modules/LLVMConfig.cmake.in b/llvm/cmake/modules/LLVMConfig.cmake.in index 7e1501a89354c8..397bd5815b64e9 100644 --- a/llvm/cmake/modules/LLVMConfig.cmake.in +++ b/llvm/cmake/modules/LLVMConfig.cmake.in @@ -60,6 +60,11 @@ if(LLVM_ENABLE_LIBEDIT) find_package(LibEdit) endif() +set(LLVM_ENABLE_TERMINFO @LLVM_ENABLE_TERMINFO@) +if(LLVM_ENABLE_TERMINFO) + find_package(Terminfo) +endif() + set(LLVM_ENABLE_THREADS @LLVM_ENABLE_THREADS@) set(LLVM_ENABLE_UNWIND_TABLES @LLVM_ENABLE_UNWIND_TABLES@) diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index c7c2c2825f58b9..1e1ccb495c3669 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -63,10 +63,6 @@ Changes to LLVM infrastructure Changes to building LLVM ------------------------ -- The ``LLVM_ENABLE_TERMINFO`` flag has been removed. LLVM no longer depends on - terminfo and now always uses the ``TERM`` environment variable for color - support autodetection. - Changes to TableGen ------------------- diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake index ff30741c8f360a..977c182e9d2b0d 100644 --- a/llvm/include/llvm/Config/config.h.cmake +++ b/llvm/include/llvm/Config/config.h.cmake @@ -209,6 +209,9 @@ /* Define to 1 if you have the header file. */ #cmakedefine HAVE_SYS_TYPES_H ${HAVE_SYS_TYPES_H} +/* Define if the setupterm() function is supported this platform. */ +#cmakedefine LLVM_ENABLE_TERMINFO ${LLVM_ENABLE_TERMINFO} + /* Define to 1 if you have the header file. 
*/ #cmakedefine HAVE_TERMIOS_H ${HAVE_TERMIOS_H} diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt index be4badc09efa58..03e888958a0711 100644 --- a/llvm/lib/Support/CMakeLists.txt +++ b/llvm/lib/Support/CMakeLists.txt @@ -56,6 +56,9 @@ elseif( CMAKE_HOST_UNIX ) STRING(REGEX REPLACE "^lib" "" Backtrace_LIBFILE ${Backtrace_LIBFILE}) set(system_libs ${system_libs} ${Backtrace_LIBFILE}) endif() + if( LLVM_ENABLE_TERMINFO ) + set(imported_libs ${imported_libs} Terminfo::terminfo) + endif() set(system_libs ${system_libs} ${LLVM_ATOMIC_LIB}) set(system_libs ${system_libs} ${LLVM_PTHREAD_LIB}) if( UNIX AND NOT (BEOS OR HAIKU) ) @@ -322,6 +325,14 @@ if(LLVM_ENABLE_ZSTD) set(llvm_system_libs ${llvm_system_libs} "${zstd_library}") endif() +if(LLVM_ENABLE_TERMINFO) + if(NOT terminfo_library) + get_property(terminfo_library TARGET Terminfo::terminfo PROPERTY LOCATION) + endif() + get_library_name(${terminfo_library} terminfo_library) + set(llvm_system_libs ${llvm_system_libs} "${terminfo_library}") +endif() + set_property(TARGET LLVMSupport PROPERTY LLVM_SYSTEM_LIBS "${llvm_system_libs}") diff --git a/llvm/lib/Support/Unix/Process.inc b/llvm/lib/Support/Unix/Process.inc index 84b10ff5d1d08a..ae90924cae1b9b 100644 --- a/llvm/lib/Support/Unix/Process.inc +++ b/llvm/lib/Support/Unix/Process.inc @@ -341,9 +341,17 @@ unsigned Process::StandardErrColumns() { return getColumns(); } -static bool terminalHasColors() { - // Check if the current terminal is one of terminals that are known to support - // ANSI color escape codes. +#ifdef LLVM_ENABLE_TERMINFO +// We manually declare these extern functions because finding the correct +// headers from various terminfo, curses, or other sources is harder than +// writing their specs down. 
+extern "C" int setupterm(char *term, int filedes, int *errret); +extern "C" struct term *set_curterm(struct term *termp); +extern "C" int del_curterm(struct term *termp); +extern "C" int tigetnum(char *capname); +#endif + +bool checkTerminalEnvironmentForColors() { if (const char *TermStr = std::getenv("TERM")) { return StringSwitch<bool>(TermStr) .Case("ansi", true) @@ -360,10 +368,54 @@ static bool terminalHasColors() { return false; } +static bool terminalHasColors(int fd) { +#ifdef LLVM_ENABLE_TERMINFO + // First, acquire a global lock because these C routines are thread hostile. + static std::mutex TermColorMutex; + std::lock_guard<std::mutex> G(TermColorMutex); + + struct term *previous_term = set_curterm(nullptr); + int errret = 0; + if (setupterm(nullptr, fd, &errret) != 0) + // Regardless of why, if we can't get terminfo, we shouldn't try to print + // colors. + return false; + + // Test whether the terminal as set up supports color output. How to do this + // isn't entirely obvious. We can use the curses routine 'has_colors' but it + // would be nice to avoid a dependency on curses proper when we can make do + // with a minimal terminfo parsing library. Also, we don't really care whether + // the terminal supports the curses-specific color changing routines, merely + // if it will interpret ANSI color escape codes in a reasonable way. Thus, the + // strategy here is just to query the baseline colors capability and if it + // supports colors at all to assume it will translate the escape codes into + // whatever range of colors it does support. We can add more detailed tests + // here if users report them as necessary. + // + // The 'tigetnum' routine returns -2 or -1 on errors, and might return 0 if + // the terminfo says that no colors are supported. + int colors_ti = tigetnum(const_cast<char *>("colors")); + bool HasColors = + colors_ti >= 0 ? 
colors_ti : checkTerminalEnvironmentForColors(); + + // Now extract the structure allocated by setupterm and free its memory + // through a really silly dance. + struct term *termp = set_curterm(previous_term); + (void)del_curterm(termp); // Drop any errors here. + + // Return true if we found a color capabilities for the current terminal. + return HasColors; +#else + // When the terminfo database is not available, check if the current terminal + // is one of terminals that are known to support ANSI color escape codes. + return checkTerminalEnvironmentForColors(); +#endif +} + bool Process::FileDescriptorHasColors(int fd) { // A file descriptor has colors if it is displayed and the terminal has // colors. - return FileDescriptorIsDisplayed(fd) && terminalHasColors(); + return FileDescriptorIsDisplayed(fd) && terminalHasColors(fd); } bool Process::StandardOutHasColors() { diff --git a/llvm/utils/gn/README.rst b/llvm/utils/gn/README.rst index 52d03be533e55e..9ca545061099d8 100644 --- a/llvm/utils/gn/README.rst +++ b/llvm/utils/gn/README.rst @@ -131,7 +131,7 @@ configure is used for three classes of feature checks: For the last two points, it would be nice if LLVM didn't have a single ``config.h`` header, but one header per toggle. That way, when e.g. -``llvm_enable_zlib`` is toggled, only the 3 files caring about that setting +``llvm_enable_terminfo`` is toggled, only the 3 files caring about that setting would need to be rebuilt, instead of everything including ``config.h``. 
GN doesn't believe in users setting arbitrary cflags from an environment diff --git a/llvm/utils/gn/build/libs/terminfo/BUILD.gn b/llvm/utils/gn/build/libs/terminfo/BUILD.gn new file mode 100644 index 00000000000000..10003d61c4df91 --- /dev/null +++ b/llvm/utils/gn/build/libs/terminfo/BUILD.gn @@ -0,0 +1,12 @@ +import("//llvm/utils/gn/build/libs/terminfo/enable.gni") + +config("terminfo_config") { + visibility = [ ":terminfo" ] + libs = [ "ncurses" ] +} + +group("terminfo") { + if (llvm_enable_terminfo) { + public_configs = [ ":terminfo_config" ] + } +} diff --git a/llvm/utils/gn/build/libs/terminfo/enable.gni b/llvm/utils/gn/build/libs/terminfo/enable.gni new file mode 100644 index 00000000000000..79ea2b601857ff --- /dev/null +++ b/llvm/utils/gn/build/libs/terminfo/enable.gni @@ -0,0 +1,4 @@ +declare_args() { + # Whether to link against terminfo. + llvm_enable_terminfo = false +} diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn index d8266fee05014b..2da26d102e7723 100644 --- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn @@ -10,6 +10,7 @@ import("//llvm/utils/gn/build/buildflags.gni") import("//llvm/utils/gn/build/libs/curl/enable.gni") import("//llvm/utils/gn/build/libs/edit/enable.gni") import("//llvm/utils/gn/build/libs/pthread/enable.gni") +import("//llvm/utils/gn/build/libs/terminfo/enable.gni") import("//llvm/utils/gn/build/libs/xar/enable.gni") import("//llvm/utils/gn/build/libs/xml/enable.gni") import("//llvm/utils/gn/build/libs/zlib/enable.gni") @@ -293,6 +294,12 @@ write_cmake_config("config") { values += [ "HAVE_LIBEDIT=" ] } + if (llvm_enable_terminfo) { + values += [ "LLVM_ENABLE_TERMINFO=1" ] + } else { + values += [ "LLVM_ENABLE_TERMINFO=" ] + } + if (llvm_enable_libxml2) { values += [ "LLVM_ENABLE_LIBXML2=1" ] } else { diff --git a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn 
b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn index 7728455499bf3d..941d448b3367c1 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn @@ -6,6 +6,7 @@ static_library("Support") { "//llvm/include/llvm/Support:write_vcsrevision", "//llvm/lib/Demangle", "//llvm/utils/gn/build/libs/pthread", + "//llvm/utils/gn/build/libs/terminfo", "//llvm/utils/gn/build/libs/zlib", ] diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-config/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-config/BUILD.gn index 711e4e3b431511..bf50cd0fce46bd 100644 --- a/llvm/utils/gn/secondary/llvm/tools/llvm-config/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/tools/llvm-config/BUILD.gn @@ -1,6 +1,7 @@ import("//llvm/lib/Target/targets_string.gni") import("//llvm/utils/gn/build/buildflags.gni") import("//llvm/utils/gn/build/libs/pthread/enable.gni") +import("//llvm/utils/gn/build/libs/terminfo/enable.gni") import("//llvm/utils/gn/build/libs/xml/enable.gni") import("//llvm/utils/gn/build/libs/zlib/enable.gni") import("//llvm/utils/gn/build/write_cmake_config.gni") @@ -35,7 +36,7 @@ write_cmake_config("BuildVariables.inc") { lib = "" } - # Windows doesn't use any of libxml2, zlib by default. + # Windows doesn't use any of libxml2, terminfo, zlib by default. # Make GN not warn about these variables being unused. 
not_needed([ "l", @@ -62,6 +63,9 @@ write_cmake_config("BuildVariables.inc") { if (llvm_enable_libxml2) { system_libs += " ${l}xml2${lib}" } + if (llvm_enable_terminfo) { + system_libs += " ${l}ncurses${lib}" + } if (llvm_enable_zlib) { system_libs += " ${l}z${lib}" } diff --git a/utils/bazel/.bazelrc b/utils/bazel/.bazelrc index 09111bcdc834ec..5a6d1889076afa 100644 --- a/utils/bazel/.bazelrc +++ b/utils/bazel/.bazelrc @@ -51,6 +51,9 @@ build --experimental_cc_shared_library build:zlib_external --repo_env=BAZEL_LLVM_ZLIB_STRATEGY=external build:zlib_system --repo_env=BAZEL_LLVM_ZLIB_STRATEGY=system +build:terminfo_external --repo_env=BAZEL_LLVM_TERMINFO_STRATEGY=external +build:terminfo_system --repo_env=BAZEL_LLVM_TERMINFO_STRATEGY=system + ############################################################################### # Options for "generic_clang" builds: these options should generally apply to # builds using a Clang-based compiler, and default to the `clang` executable on diff --git a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h index a4fb47d677ab15..e9385f45c5e5cd 100644 --- a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h +++ b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h @@ -222,6 +222,9 @@ /* Define to 1 if you have the header file. */ #define HAVE_SYS_TYPES_H 1 +/* Define if the setupterm() function is supported this platform. */ +/* LLVM_ENABLE_TERMINFO defined in Bazel */ + /* Define to 1 if you have the header file. */ #define HAVE_TERMIOS_H 1 diff --git a/utils/bazel/llvm_configs/config.h.cmake b/utils/bazel/llvm_configs/config.h.cmake index ff30741c8f360a..977c182e9d2b0d 100644 --- a/utils/bazel/llvm_configs/config.h.cmake +++ b/utils/bazel/llvm_configs/config.h.cmake @@ -209,6 +209,9 @@ /* Define to 1 if you have the header file. 
*/ #cmakedefine HAVE_SYS_TYPES_H ${HAVE_SYS_TYPES_H} +/* Define if the setupterm() function is supported this platform. */ +#cmakedefine LLVM_ENABLE_TERMINFO ${LLVM_ENABLE_TERMINFO} + /* Define to 1 if you have the header file. */ #cmakedefine HAVE_TERMIOS_H ${HAVE_TERMIOS_H} From 1de6011c34b185235cd65c2e3fb030015d182968 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 29 May 2024 08:35:32 -0700 Subject: [PATCH 142/230] [ValueTypes] Remove hardcoded 224 from VTEmitter.cpp. NFC Add a new bit to ValueTypes.td to indicate whether a type should be part of the [FIRST_VALUETYPE,LAST_VALUETYPE] range or not. This was reviewed as part of #93654. --- llvm/include/llvm/CodeGen/ValueTypes.td | 6 ++++++ llvm/utils/TableGen/VTEmitter.cpp | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td index 0d8eaf1b2b1dd0..a6981b0ffa13c2 100644 --- a/llvm/include/llvm/CodeGen/ValueTypes.td +++ b/llvm/include/llvm/CodeGen/ValueTypes.td @@ -23,6 +23,9 @@ class ValueType { bit isFP = false; bit isVector = false; bit isScalable = false; + // Indicates this VT should be included in the + // [FIRST_VALUETYPE,LAST_VALUETYPE] range. + bit isNormalValueType = true; } class VTAny : ValueType<0, value> { @@ -287,6 +290,7 @@ def aarch64svcount : ValueType<16, 199>; // AArch64 predicate-as-counter def spirvbuiltin : ValueType<0, 200>; // SPIR-V's builtin type +let isNormalValueType = false in { def token : ValueType<0, 248>; // TokenTy def MetadataVT : ValueType<0, 249> { // Metadata let LLVMName = "Metadata"; @@ -316,6 +320,8 @@ def iPTR : ValueType<0, 254>; // Should only be used in TableGen. 
def Any : VTAny<255>; +} // isNormalValueType = false + } // end defset ValueTypes /// This class is for targets that want to use pointer types in patterns diff --git a/llvm/utils/TableGen/VTEmitter.cpp b/llvm/utils/TableGen/VTEmitter.cpp index 9174fe48b62a9d..64b54ed134232c 100644 --- a/llvm/utils/TableGen/VTEmitter.cpp +++ b/llvm/utils/TableGen/VTEmitter.cpp @@ -72,6 +72,7 @@ void VTEmitter::run(raw_ostream &OS) { bool IsFP = VT->getValueAsBit("isFP"); bool IsVector = VT->getValueAsBit("isVector"); bool IsScalable = VT->getValueAsBit("isScalable"); + bool IsNormalValueType = VT->getValueAsBit("isNormalValueType"); UpdateVTRange("INTEGER_FIXEDLEN_VECTOR_VALUETYPE", Name, IsInteger && IsVector && !IsScalable); @@ -85,7 +86,7 @@ void VTEmitter::run(raw_ostream &OS) { UpdateVTRange("VECTOR_VALUETYPE", Name, IsVector); UpdateVTRange("INTEGER_VALUETYPE", Name, IsInteger && !IsVector); UpdateVTRange("FP_VALUETYPE", Name, IsFP && !IsVector); - UpdateVTRange("VALUETYPE", Name, Value < 224); + UpdateVTRange("VALUETYPE", Name, IsNormalValueType); // clang-format off OS << " GET_VT_ATTR(" From 6d90ac1e06f31cae9806a8815158e2851cf8e987 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= Date: Wed, 29 May 2024 18:05:33 +0200 Subject: [PATCH 143/230] [GlobalIsel] Combine freeze (#93239) --- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 4 - .../include/llvm/Target/GlobalISel/Combine.td | 24 +- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 12 +- .../GlobalISel/CombinerHelperVectorOps.cpp | 53 - llvm/lib/CodeGen/GlobalISel/Utils.cpp | 102 +- .../GlobalISel/combine-extract-vec-elt.mir | 4 +- .../AArch64/GlobalISel/combine-freeze.mir | 1154 +++++++++++++++++ .../GlobalISel/combine-insert-vec-elt.mir | 6 +- ...galizer-combiner-divrem-insertpt-crash.mir | 3 +- llvm/test/CodeGen/AArch64/fast-isel-select.ll | 594 +++++++-- llvm/test/CodeGen/AMDGPU/div_i128.ll | 490 +++---- 11 files changed, 1971 insertions(+), 475 deletions(-) create mode 100644 
llvm/test/CodeGen/AArch64/GlobalISel/combine-freeze.mir diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 2111e82e1a99d2..2ddf20ebe7af72 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -840,10 +840,6 @@ class CombinerHelper { /// Combine extract vector element. bool matchExtractVectorElement(MachineInstr &MI, BuildFnTy &MatchInfo); - /// Combine extract vector element with freeze on the vector register. - bool matchExtractVectorElementWithFreeze(const MachineOperand &MO, - BuildFnTy &MatchInfo); - /// Combine extract vector element with a build vector on the vector register. bool matchExtractVectorElementWithBuildVector(const MachineOperand &MO, BuildFnTy &MatchInfo); diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 8012f919227778..383589add7755c 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -1532,13 +1532,6 @@ def extract_vector_element_build_vector_trunc8 : GICombineRule< [{ return Helper.matchExtractVectorElementWithBuildVectorTrunc(${root}, ${matchinfo}); }]), (apply [{ Helper.applyBuildFnMO(${root}, ${matchinfo}); }])>; -def extract_vector_element_freeze : GICombineRule< - (defs root:$root, build_fn_matchinfo:$matchinfo), - (match (G_FREEZE $src, $input), - (G_EXTRACT_VECTOR_ELT $root, $src, $idx), - [{ return Helper.matchExtractVectorElementWithFreeze(${root}, ${matchinfo}); }]), - (apply [{ Helper.applyBuildFnMO(${root}, ${matchinfo}); }])>; - def sext_trunc : GICombineRule< (defs root:$root, build_fn_matchinfo:$matchinfo), (match (G_TRUNC $src, $x, (MIFlags NoSWrap)), @@ -1636,7 +1629,6 @@ extract_vector_element_build_vector_trunc5, extract_vector_element_build_vector_trunc6, extract_vector_element_build_vector_trunc7, extract_vector_element_build_vector_trunc8, 
-extract_vector_element_freeze, extract_vector_element_shuffle_vector, insert_vector_element_extract_vector_element ]>; @@ -1713,6 +1705,17 @@ def integer_reassoc_combines: GICombineGroup<[ APlusBMinusCPlusA ]>; +def freeze_of_non_undef_non_poison : GICombineRule< + (defs root:$root), + (match (G_FREEZE $root, $src), + [{ return isGuaranteedNotToBeUndefOrPoison(${src}.getReg(), MRI); }]), + (apply (GIReplaceReg $root, $src))>; + +def freeze_combines: GICombineGroup<[ + freeze_of_non_undef_non_poison, + push_freeze_to_prevent_poison_from_propagating +]>; + // FIXME: These should use the custom predicate feature once it lands. def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero, undef_to_negative_one, @@ -1771,7 +1774,7 @@ def constant_fold_binops : GICombineGroup<[constant_fold_binop, constant_fold_fp_binop]>; def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines, - vector_ops_combines, + vector_ops_combines, freeze_combines, insert_vec_elt_combines, extract_vec_elt_combines, combines_for_extload, combine_extracted_vector_load, undef_combines, identity_combines, phi_combines, @@ -1793,8 +1796,7 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines, sub_add_reg, select_to_minmax, redundant_binop_in_equality, fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors, combine_concat_vector, double_icmp_zero_and_or_combine, match_addos, - sext_trunc, zext_trunc, combine_shuffle_concat, - push_freeze_to_prevent_poison_from_propagating]>; + sext_trunc, zext_trunc, combine_shuffle_concat]>; // A combine group used to for prelegalizer combiners at -O0. 
The combines in // this group have been selected based on experiments to balance code size and diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 4cc602b5c87092..dcc1335a4bd44b 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -265,11 +265,14 @@ bool CombinerHelper::matchFreezeOfSingleMaybePoisonOperand( } } - cast(OrigDef)->dropPoisonGeneratingFlags(); - // Eliminate freeze if all operands are guaranteed non-poison. if (!MaybePoisonOperand) { - MatchInfo = [=](MachineIRBuilder &B) { MRI.replaceRegWith(DstOp, OrigOp); }; + MatchInfo = [=](MachineIRBuilder &B) { + Observer.changingInstr(*OrigDef); + cast(OrigDef)->dropPoisonGeneratingFlags(); + Observer.changedInstr(*OrigDef); + B.buildCopy(DstOp, OrigOp); + }; return true; } @@ -277,6 +280,9 @@ bool CombinerHelper::matchFreezeOfSingleMaybePoisonOperand( LLT MaybePoisonOperandRegTy = MRI.getType(MaybePoisonOperandReg); MatchInfo = [=](MachineIRBuilder &B) mutable { + Observer.changingInstr(*OrigDef); + cast(OrigDef)->dropPoisonGeneratingFlags(); + Observer.changedInstr(*OrigDef); B.setInsertPt(*OrigDef->getParent(), OrigDef->getIterator()); auto Freeze = B.buildFreeze(MaybePoisonOperandRegTy, MaybePoisonOperandReg); replaceRegOpWith( diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp index 21b1eb26281742..b4765fb280f9dd 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp @@ -144,59 +144,6 @@ bool CombinerHelper::matchExtractVectorElementWithDifferentIndices( return false; } -bool CombinerHelper::matchExtractVectorElementWithFreeze( - const MachineOperand &MO, BuildFnTy &MatchInfo) { - MachineInstr *Root = getDefIgnoringCopies(MO.getReg(), MRI); - GExtractVectorElement *Extract = cast(Root); - - Register Vector = Extract->getVectorReg(); 
- - // - // %bv:_(<2 x s32>) = G_BUILD_VECTOR %arg1(s32), %arg2(s32) - // %freeze:_(<2 x s32>) = G_FREEZE %bv(<2 x s32>) - // %extract:_(s32) = G_EXTRACT_VECTOR_ELT %bv(<2 x s32>), %opaque(s64) - // - // --> - // - // %bv:_(<2 x s32>) = G_BUILD_VECTOR %arg1(s32), %arg2(s32) - // %extract:_(s32) = G_EXTRACT_VECTOR_ELT %bv(<2 x s32>), %opaque(s64) - // %freeze:_(s32) = G_FREEZE %extract(s32) - // - // - - // For G_FREEZE, the input and the output types are identical. Moving the - // freeze from the Vector into the front of the extract preserves the freeze - // semantics. The result is still freeze'd. Furthermore, the Vector register - // becomes easier to analyze. A build vector could have been hidden behind the - // freeze. - - // We expect a freeze on the Vector register. - GFreeze *Freeze = getOpcodeDef(Vector, MRI); - if (!Freeze) - return false; - - Register Dst = Extract->getReg(0); - LLT DstTy = MRI.getType(Dst); - - // We first have to check for one-use and legality of the freeze. - // The type of the extractVectorElement did not change. - if (!MRI.hasOneNonDBGUse(Freeze->getReg(0)) || - !isLegalOrBeforeLegalizer({TargetOpcode::G_FREEZE, {DstTy}})) - return false; - - Register Index = Extract->getIndexReg(); - - // We move the freeze from the Vector register in front of the - // extractVectorElement. 
- MatchInfo = [=](MachineIRBuilder &B) { - auto Extract = - B.buildExtractVectorElement(DstTy, Freeze->getSourceReg(), Index); - B.buildFreeze(Dst, Extract); - }; - - return true; -} - bool CombinerHelper::matchExtractVectorElementWithBuildVector( const MachineOperand &MO, BuildFnTy &MatchInfo) { MachineInstr *Root = getDefIgnoringCopies(MO.getReg(), MRI); diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index f455482e02943f..e8438be94b3cd2 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -1724,6 +1724,39 @@ bool llvm::isPreISelGenericFloatingPointOpcode(unsigned Opc) { } } +/// Shifts return poison if shiftwidth is larger than the bitwidth. +static bool shiftAmountKnownInRange(Register ShiftAmount, + const MachineRegisterInfo &MRI) { + LLT Ty = MRI.getType(ShiftAmount); + + if (Ty.isScalableVector()) + return false; // Can't tell, just return false to be safe + + if (Ty.isScalar()) { + std::optional<ValueAndVReg> Val = + getIConstantVRegValWithLookThrough(ShiftAmount, MRI); + if (!Val) + return false; + return Val->Value.ult(Ty.getScalarSizeInBits()); + } + + GBuildVector *BV = getOpcodeDef<GBuildVector>(ShiftAmount, MRI); + if (!BV) + return false; + + unsigned Sources = BV->getNumSources(); + for (unsigned I = 0; I < Sources; ++I) { + std::optional<ValueAndVReg> Val = + getIConstantVRegValWithLookThrough(BV->getSourceReg(I), MRI); + if (!Val) + return false; + if (!Val->Value.ult(Ty.getScalarSizeInBits())) + return false; + } + + return true; +} + namespace { enum class UndefPoisonKind { PoisonOnly = (1 << 0), @@ -1732,11 +1765,11 @@ enum class UndefPoisonKind { }; } -[[maybe_unused]] static bool includesPoison(UndefPoisonKind Kind) { +static bool includesPoison(UndefPoisonKind Kind) { return (unsigned(Kind) & unsigned(UndefPoisonKind::PoisonOnly)) != 0; } -[[maybe_unused]] static bool includesUndef(UndefPoisonKind Kind) { +static bool includesUndef(UndefPoisonKind Kind) { return (unsigned(Kind) & 
unsigned(UndefPoisonKind::UndefOnly)) != 0; } @@ -1745,18 +1778,55 @@ static bool canCreateUndefOrPoison(Register Reg, const MachineRegisterInfo &MRI, UndefPoisonKind Kind) { MachineInstr *RegDef = MRI.getVRegDef(Reg); - if (auto *GMI = dyn_cast(RegDef)) { - if (ConsiderFlagsAndMetadata && includesPoison(Kind) && - GMI->hasPoisonGeneratingFlags()) - return true; - } else { - // Conservatively return true. - return true; - } + if (ConsiderFlagsAndMetadata && includesPoison(Kind)) + if (auto *GMI = dyn_cast(RegDef)) + if (GMI->hasPoisonGeneratingFlags()) + return true; + // Check whether opcode is a poison/undef-generating operation. switch (RegDef->getOpcode()) { case TargetOpcode::G_FREEZE: + case TargetOpcode::G_BUILD_VECTOR: + case TargetOpcode::G_CONSTANT_FOLD_BARRIER: return false; + case TargetOpcode::G_SHL: + case TargetOpcode::G_ASHR: + case TargetOpcode::G_LSHR: + return includesPoison(Kind) && + !shiftAmountKnownInRange(RegDef->getOperand(2).getReg(), MRI); + case TargetOpcode::G_FPTOSI: + case TargetOpcode::G_FPTOUI: + // fptosi/ui yields poison if the resulting value does not fit in the + // destination type. 
+ return true; + case TargetOpcode::G_CTLZ: + case TargetOpcode::G_CTTZ: + case TargetOpcode::G_ABS: + case TargetOpcode::G_CTPOP: + case TargetOpcode::G_BSWAP: + case TargetOpcode::G_BITREVERSE: + case TargetOpcode::G_FSHL: + case TargetOpcode::G_FSHR: + case TargetOpcode::G_SMAX: + case TargetOpcode::G_SMIN: + case TargetOpcode::G_UMAX: + case TargetOpcode::G_UMIN: + case TargetOpcode::G_PTRMASK: + case TargetOpcode::G_SADDO: + case TargetOpcode::G_SSUBO: + case TargetOpcode::G_UADDO: + case TargetOpcode::G_USUBO: + case TargetOpcode::G_SMULO: + case TargetOpcode::G_UMULO: + case TargetOpcode::G_SADDSAT: + case TargetOpcode::G_UADDSAT: + case TargetOpcode::G_SSUBSAT: + case TargetOpcode::G_USUBSAT: + return false; + case TargetOpcode::G_SSHLSAT: + case TargetOpcode::G_USHLSAT: + return includesPoison(Kind) && + !shiftAmountKnownInRange(RegDef->getOperand(2).getReg(), MRI); default: return !isa(RegDef) && !isa(RegDef); } @@ -1776,6 +1846,18 @@ static bool isGuaranteedNotToBeUndefOrPoison(Register Reg, return true; case TargetOpcode::G_IMPLICIT_DEF: return !includesUndef(Kind); + case TargetOpcode::G_CONSTANT: + case TargetOpcode::G_FCONSTANT: + return true; + case TargetOpcode::G_BUILD_VECTOR: { + GBuildVector *BV = cast(RegDef); + unsigned NumSources = BV->getNumSources(); + for (unsigned I = 0; I < NumSources; ++I) + if (!::isGuaranteedNotToBeUndefOrPoison(BV->getSourceReg(I), MRI, + Depth + 1, Kind)) + return false; + return true; + } default: { auto MOCheck = [&](const MachineOperand &MO) { if (!MO.isReg()) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir index d5d33742148ada..70241e71aa593f 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir @@ -361,8 +361,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %vec:_(<2 x s64>) = COPY $q0 ; CHECK-NEXT: %idx:_(s64) = COPY $x1 - 
; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s64) = G_EXTRACT_VECTOR_ELT %vec(<2 x s64>), %idx(s64) - ; CHECK-NEXT: %extract:_(s64) = G_FREEZE [[EVEC]] + ; CHECK-NEXT: %fvec:_(<2 x s64>) = G_FREEZE %vec + ; CHECK-NEXT: %extract:_(s64) = G_EXTRACT_VECTOR_ELT %fvec(<2 x s64>), %idx(s64) ; CHECK-NEXT: $x0 = COPY %extract(s64) ; CHECK-NEXT: RET_ReallyLR implicit $x0 %vec:_(<2 x s64>) = COPY $q0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-freeze.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-freeze.mir new file mode 100644 index 00000000000000..5ec8ef5cdcb196 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-freeze.mir @@ -0,0 +1,1154 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s | FileCheck %s + +... +--- +name: freeze_register +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_register + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY]] + ; CHECK-NEXT: $x0 = COPY [[FREEZE]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %1:_(s64) = G_FREEZE %0 + $x0 = COPY %1(s64) + RET_ReallyLR implicit $x0 + +... +--- +name: freeze_constant +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_constant + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: $x0 = COPY [[C]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %1:_(s64) = G_CONSTANT i64 9 + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 + +... 
+--- +name: freeze_fconstant +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_fconstant + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 9.000000e+00 + ; CHECK-NEXT: $x0 = COPY [[C]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %1:_(s64) = G_FCONSTANT double 9.0 + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... +--- +name: freeze_undef +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_undef + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[DEF]] + ; CHECK-NEXT: $x0 = COPY [[FREEZE]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %1:_(s64) = G_IMPLICIT_DEF + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... +--- +name: freeze_freeze +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_freeze + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY]] + ; CHECK-NEXT: $x0 = COPY [[FREEZE]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %1:_(s64) = G_FREEZE %0 + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... 
+--- +name: freeze_buildvector +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_buildvector + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32) + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(<4 x s32>) = G_FREEZE [[BUILD_VECTOR]] + ; CHECK-NEXT: $q0 = COPY [[FREEZE]](<4 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:_(s32) = COPY $w0 + %1:_(<4 x s32>) = G_BUILD_VECTOR %0(s32), %0(s32), %0(s32), %0(s32) + %2:_(<4 x s32>) = G_FREEZE %1 + $q0 = COPY %2(<4 x s32>) + RET_ReallyLR implicit $q0 +... +--- +name: freeze_buildvector_const +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_buildvector_const + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %c:_(s32) = G_CONSTANT i32 6 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR %c(s32), %c(s32), %c(s32), %c(s32) + ; CHECK-NEXT: $q0 = COPY [[BUILD_VECTOR]](<4 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:_(s32) = COPY $w0 + %c:_(s32) = G_CONSTANT i32 6 + %1:_(<4 x s32>) = G_BUILD_VECTOR %c(s32), %c(s32), %c(s32), %c(s32) + %2:_(<4 x s32>) = G_FREEZE %1 + $q0 = COPY %2(<4 x s32>) + RET_ReallyLR implicit $q0 +... +--- +name: freeze_disjoint_or_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_disjoint_or_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: $x0 = COPY %c(s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = disjoint G_OR %c, %c + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... 
+--- +name: freeze_or_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_or_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: $x0 = COPY %c(s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = G_OR %c, %c + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... +--- +name: freeze_nneg_zext_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_nneg_zext_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 9 + ; CHECK-NEXT: %c:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT %c(s32) + ; CHECK-NEXT: $x0 = COPY [[ZEXT]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s32) = G_CONSTANT i32 9 + %c:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = nneg G_ZEXT %c + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... +--- +name: freeze_zext_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_zext_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 9 + ; CHECK-NEXT: %c:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT %c(s32) + ; CHECK-NEXT: $x0 = COPY [[ZEXT]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s32) = G_CONSTANT i32 9 + %c:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = G_ZEXT %c + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... 
+--- +name: freeze_udiv_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_udiv_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[UDIV:%[0-9]+]]:_(s64) = G_UDIV %c, %c + ; CHECK-NEXT: $x0 = COPY [[UDIV]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = G_UDIV %c, %c + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... +--- +name: freeze_exact_udiv_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_exact_udiv_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[UDIV:%[0-9]+]]:_(s64) = G_UDIV %c, %c + ; CHECK-NEXT: $x0 = COPY [[UDIV]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = exact G_UDIV %c, %c + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... +--- +name: freeze_mul_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_mul_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL %c, %c + ; CHECK-NEXT: $x0 = COPY [[MUL]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = G_MUL %c, %c + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... 
+--- +name: freeze_nsw_mul_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_nsw_mul_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL %c, %c + ; CHECK-NEXT: $x0 = COPY [[MUL]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = nsw G_MUL %c, %c + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... +--- +name: freeze_trunc_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_trunc_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %c(s64) + ; CHECK-NEXT: $w0 = COPY [[TRUNC]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s32) = G_TRUNC %c + %2:_(s32) = G_FREEZE %1 + $w0 = COPY %2(s32) + RET_ReallyLR implicit $q0 +... +--- +name: freeze_nuw_trunc_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_nuw_trunc_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %c(s64) + ; CHECK-NEXT: $w0 = COPY [[TRUNC]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s32) = nuw G_TRUNC %c + %2:_(s32) = G_FREEZE %1 + $w0 = COPY %2(s32) + RET_ReallyLR implicit $q0 +... 
+--- +name: freeze_add_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_add_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD %c, %c + ; CHECK-NEXT: $x0 = COPY [[ADD]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = G_ADD %c, %c + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... +--- +name: freeze_nuw_add_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_nuw_add_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD %c, %c + ; CHECK-NEXT: $x0 = COPY [[ADD]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = nuw G_ADD %c, %c + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... +--- +name: freeze_xor_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_xor_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR %c, %c + ; CHECK-NEXT: $x0 = COPY [[XOR]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = G_XOR %c, %c + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... 
+--- +name: freeze_fptosi_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_fptosi_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[FPTOSI:%[0-9]+]]:_(s64) = G_FPTOSI %c(s64) + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[FPTOSI]] + ; CHECK-NEXT: $x0 = COPY [[FREEZE]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = G_FPTOSI %c + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... +--- +name: freeze_fptoui_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_fptoui_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[FPTOUI:%[0-9]+]]:_(s64) = G_FPTOUI %c(s64) + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[FPTOUI]] + ; CHECK-NEXT: $x0 = COPY [[FREEZE]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = G_FPTOUI %c + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... 
+--- +name: freeze_shl_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_shl_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL %c, %c(s64) + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[SHL]] + ; CHECK-NEXT: $x0 = COPY [[FREEZE]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = G_SHL %c, %c + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... +--- +name: freeze_ashr_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_ashr_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR %c, %c(s64) + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[ASHR]] + ; CHECK-NEXT: $x0 = COPY [[FREEZE]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = G_ASHR %c, %c + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... 
+--- +name: freeze_lshr_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_lshr_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR %c, %c(s64) + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[LSHR]] + ; CHECK-NEXT: $x0 = COPY [[FREEZE]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = G_LSHR %c, %c + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... +--- +name: freeze_ctlz_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_ctlz_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[CTLZ:%[0-9]+]]:_(s64) = G_CTLZ %c(s64) + ; CHECK-NEXT: $x0 = COPY [[CTLZ]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = G_CTLZ %c + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... +--- +name: freeze_cttz_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_cttz_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[CTTZ:%[0-9]+]]:_(s64) = G_CTTZ %c(s64) + ; CHECK-NEXT: $x0 = COPY [[CTTZ]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = G_CTTZ %c + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... 
+--- +name: freeze_abs_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_abs_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[ABS:%[0-9]+]]:_(s64) = G_ABS %c + ; CHECK-NEXT: $x0 = COPY [[ABS]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = G_ABS %c + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... +--- +name: freeze_bswap_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_bswap_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[BSWAP:%[0-9]+]]:_(s64) = G_BSWAP %c + ; CHECK-NEXT: $x0 = COPY [[BSWAP]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = G_BSWAP %c + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... +--- +name: freeze_bitreverse_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_bitreverse_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[BITREVERSE:%[0-9]+]]:_(s64) = G_BITREVERSE %c + ; CHECK-NEXT: $x0 = COPY [[BITREVERSE]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = G_BITREVERSE %c + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... 
+--- +name: freeze_icmp_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_icmp_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %cmp:_(s1) = G_ICMP intpred(eq), %c(s64), %d + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE %cmp + ; CHECK-NEXT: %ext:_(s64) = G_ZEXT [[FREEZE]](s1) + ; CHECK-NEXT: $x0 = COPY %ext(s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %cmp:_(s1) = G_ICMP intpred(eq), %c(s64), %d + %2:_(s1) = G_FREEZE %cmp + %ext:_(s64) = G_ZEXT %2(s1) + $x0 = COPY %ext(s64) + RET_ReallyLR implicit $x0 +... +--- +name: freeze_fcmp_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_fcmp_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %cmp:_(s1) = G_FCMP floatpred(oeq), %c(s64), %d + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE %cmp + ; CHECK-NEXT: %ext:_(s64) = G_ZEXT [[FREEZE]](s1) + ; CHECK-NEXT: $x0 = COPY %ext(s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %cmp:_(s1) = G_FCMP floatpred(oeq), %c(s64), %d + %2:_(s1) = G_FREEZE %cmp + %ext:_(s64) = G_ZEXT %2(s1) + $x0 = COPY %ext(s64) + RET_ReallyLR implicit $x0 +... 
+--- +name: freeze_fshl_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_fshl_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[ROTL:%[0-9]+]]:_(s64) = G_ROTL %c, %c(s64) + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[ROTL]] + ; CHECK-NEXT: $x0 = COPY [[FREEZE]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = G_FSHL %c, %c, %c + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... +--- +name: freeze_fshr_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_fshr_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[ROTR:%[0-9]+]]:_(s64) = G_ROTR %c, %c(s64) + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[ROTR]] + ; CHECK-NEXT: $x0 = COPY [[FREEZE]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = G_FSHR %c, %c, %c + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... 
+--- +name: freeze_smax_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_smax_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[SMAX:%[0-9]+]]:_(s64) = G_SMAX %c, %d + ; CHECK-NEXT: $x0 = COPY [[SMAX]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = G_SMAX %c, %d + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... +--- +name: freeze_smin_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_smin_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[SMIN:%[0-9]+]]:_(s64) = G_SMIN %c, %d + ; CHECK-NEXT: $x0 = COPY [[SMIN]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = G_SMIN %c, %d + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... 
+--- +name: freeze_umax_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_umax_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[UMAX:%[0-9]+]]:_(s64) = G_UMAX %c, %d + ; CHECK-NEXT: $x0 = COPY [[UMAX]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = G_UMAX %c, %d + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... +--- +name: freeze_umin_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_umin_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[UMIN:%[0-9]+]]:_(s64) = G_UMIN %c, %d + ; CHECK-NEXT: $x0 = COPY [[UMIN]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = G_UMIN %c, %d + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... 
+--- +name: freeze_ptrmask_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_ptrmask_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %p:_(p0) = COPY $x0 + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(p0) = G_FREEZE %p + ; CHECK-NEXT: [[PTRMASK:%[0-9]+]]:_(p0) = G_PTRMASK [[FREEZE]], %cst(s64) + ; CHECK-NEXT: $x0 = COPY [[PTRMASK]](p0) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %p:_(p0) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(p0) = G_PTRMASK %p, %cst + %2:_(p0) = G_FREEZE %1 + $x0 = COPY %2(p0) + RET_ReallyLR implicit $x0 +... +--- +name: freeze_saddo_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_saddo_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD %c, %d + ; CHECK-NEXT: $x0 = COPY [[ADD]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64), %o:_(s1) = G_SADDO %c, %d + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... 
+--- +name: freeze_ssubo_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_ssubo_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %4:_(s64), %o:_(s1) = G_SSUBO %c, %d + ; CHECK-NEXT: $x0 = COPY %4(s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64), %o:_(s1) = G_SSUBO %c, %d + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... +--- +name: freeze_uaddo_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_uaddo_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD %c, %d + ; CHECK-NEXT: $x0 = COPY [[ADD]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64), %o:_(s1) = G_UADDO %c, %d + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... 
+--- +name: freeze_usubo_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_usubo_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %4:_(s64), %o:_(s1) = G_USUBO %c, %d + ; CHECK-NEXT: $x0 = COPY %4(s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64), %o:_(s1) = G_USUBO %c, %d + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... +--- +name: freeze_smulo_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_smulo_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %4:_(s64), %o:_(s1) = G_SMULO %c, %d + ; CHECK-NEXT: $x0 = COPY %4(s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64), %o:_(s1) = G_SMULO %c, %d + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... 
+--- +name: freeze_umulo_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_umulo_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %4:_(s64), %o:_(s1) = G_UMULO %c, %d + ; CHECK-NEXT: $x0 = COPY %4(s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64), %o:_(s1) = G_UMULO %c, %d + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... +--- +name: freeze_saddsat_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_saddsat_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[SADDSAT:%[0-9]+]]:_(s64) = G_SADDSAT %c, %d + ; CHECK-NEXT: $x0 = COPY [[SADDSAT]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = G_SADDSAT %c, %d + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... 
+--- +name: freeze_uaddsat_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_uaddsat_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[UADDSAT:%[0-9]+]]:_(s64) = G_UADDSAT %c, %d + ; CHECK-NEXT: $x0 = COPY [[UADDSAT]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = G_UADDSAT %c, %d + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... +--- +name: freeze_ssubsat_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_ssubsat_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[SSUBSAT:%[0-9]+]]:_(s64) = G_SSUBSAT %c, %d + ; CHECK-NEXT: $x0 = COPY [[SSUBSAT]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = G_SSUBSAT %c, %d + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... 
+--- +name: freeze_usubsat_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_usubsat_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[USUBSAT:%[0-9]+]]:_(s64) = G_USUBSAT %c, %d + ; CHECK-NEXT: $x0 = COPY [[USUBSAT]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = G_USUBSAT %c, %d + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... +--- +name: freeze_sshlsat_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_sshlsat_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[SSHLSAT:%[0-9]+]]:_(s64) = G_SSHLSAT %c, %d(s64) + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[SSHLSAT]] + ; CHECK-NEXT: $x0 = COPY [[FREEZE]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = G_SSHLSAT %c, %d + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 +... 
+--- +name: freeze_ushlsat_fold_barrier +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: freeze_ushlsat_fold_barrier + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9 + ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: [[USHLSAT:%[0-9]+]]:_(s64) = G_USHLSAT %c, %d(s64) + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[USHLSAT]] + ; CHECK-NEXT: $x0 = COPY [[FREEZE]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %cst:_(s64) = G_CONSTANT i64 9 + %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst + %1:_(s64) = G_USHLSAT %c, %d + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-insert-vec-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-insert-vec-elt.mir index 0c67a867580ccd..c000a8e635bc6b 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-insert-vec-elt.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-insert-vec-elt.mir @@ -253,10 +253,10 @@ body: | ; CHECK: liveins: $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 127 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s8>) = G_BUILD_VECTOR [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(<32 x s8>) = G_FREEZE [[BUILD_VECTOR]] - ; CHECK-NEXT: G_STORE [[FREEZE]](<32 x s8>), [[COPY]](p0) :: (store (<32 x s8>)) + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s8>) = 
G_BUILD_VECTOR [[C]](s8), [[C]](s8), [[C]](s8), [[DEF]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8) + ; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<32 x s8>), [[COPY]](p0) :: (store (<32 x s8>)) ; CHECK-NEXT: RET_ReallyLR %3:_(s8) = G_CONSTANT i8 127 %2:_(<32 x s8>) = G_BUILD_VECTOR %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-crash.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-crash.mir index ca403f85156113..767ece62b8731f 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-crash.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-crash.mir @@ -24,8 +24,7 @@ body: | ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 - ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[C1]] - ; CHECK-NEXT: [[UDIV:%[0-9]+]]:_(s64) = G_UDIV [[FREEZE]], [[C]] + ; CHECK-NEXT: [[UDIV:%[0-9]+]]:_(s64) = G_UDIV [[C1]], [[C]] ; CHECK-NEXT: G_STORE [[UDIV]](s64), [[COPY]](p0) :: (store (s64)) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: diff --git a/llvm/test/CodeGen/AArch64/fast-isel-select.ll b/llvm/test/CodeGen/AArch64/fast-isel-select.ll index 6ad4a5ae572e0e..65701343ccc1e5 100644 --- a/llvm/test/CodeGen/AArch64/fast-isel-select.ll +++ b/llvm/test/CodeGen/AArch64/fast-isel-select.ll @@ -1,175 +1,382 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=aarch64-apple-darwin -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort=1 -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=aarch64-apple-darwin -global-isel -verify-machineinstrs < %s | FileCheck %s --check-prefix=GISEL +; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-FASTISEL +; RUN: llc -mtriple=aarch64-apple-darwin -global-isel -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GISEL ; First test the different supported value types for select. define zeroext i1 @select_i1(i1 zeroext %c, i1 zeroext %a, i1 zeroext %b) { -; CHECK-LABEL: select_i1 -; CHECK: {{cmp w0, #0|tst w0, #0x1}} -; CHECK-NEXT: csel {{w[0-9]+}}, w1, w2, ne +; GISEL-LABEL: select_i1: +; GISEL: ; %bb.0: +; GISEL-NEXT: tst w0, #0x1 +; GISEL-NEXT: csel w0, w1, w2, ne +; GISEL-NEXT: ret +; CHECK-FASTISEL-LABEL: select_i1: +; CHECK-FASTISEL: ; %bb.0: +; CHECK-FASTISEL-NEXT: tst w0, #0x1 +; CHECK-FASTISEL-NEXT: csel w8, w1, w2, ne +; CHECK-FASTISEL-NEXT: and w0, w8, #0x1 +; CHECK-FASTISEL-NEXT: ret +; +; CHECK-GISEL-LABEL: select_i1: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: tst w0, #0x1 +; CHECK-GISEL-NEXT: csel w0, w1, w2, ne +; CHECK-GISEL-NEXT: ret %1 = select i1 %c, i1 %a, i1 %b ret i1 %1 } define zeroext i8 @select_i8(i1 zeroext %c, i8 zeroext %a, i8 zeroext %b) { -; CHECK-LABEL: select_i8 -; CHECK: {{cmp w0, #0|tst w0, #0x1}} -; CHECK-NEXT: csel {{w[0-9]+}}, w1, w2, ne +; GISEL-LABEL: select_i8: +; GISEL: ; %bb.0: +; GISEL-NEXT: tst w0, #0x1 +; GISEL-NEXT: csel w0, w1, w2, ne +; GISEL-NEXT: ret +; CHECK-FASTISEL-LABEL: select_i8: +; CHECK-FASTISEL: ; %bb.0: +; CHECK-FASTISEL-NEXT: tst w0, #0x1 +; CHECK-FASTISEL-NEXT: csel w8, w1, w2, ne +; CHECK-FASTISEL-NEXT: uxtb w0, w8 +; CHECK-FASTISEL-NEXT: ret +; +; CHECK-GISEL-LABEL: 
select_i8: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: tst w0, #0x1 +; CHECK-GISEL-NEXT: csel w0, w1, w2, ne +; CHECK-GISEL-NEXT: ret %1 = select i1 %c, i8 %a, i8 %b ret i8 %1 } define zeroext i16 @select_i16(i1 zeroext %c, i16 zeroext %a, i16 zeroext %b) { -; CHECK-LABEL: select_i16 -; CHECK: {{cmp w0, #0|tst w0, #0x1}} -; CHECK-NEXT: csel {{w[0-9]+}}, w1, w2, ne +; GISEL-LABEL: select_i16: +; GISEL: ; %bb.0: +; GISEL-NEXT: tst w0, #0x1 +; GISEL-NEXT: csel w0, w1, w2, ne +; GISEL-NEXT: ret +; CHECK-FASTISEL-LABEL: select_i16: +; CHECK-FASTISEL: ; %bb.0: +; CHECK-FASTISEL-NEXT: tst w0, #0x1 +; CHECK-FASTISEL-NEXT: csel w8, w1, w2, ne +; CHECK-FASTISEL-NEXT: uxth w0, w8 +; CHECK-FASTISEL-NEXT: ret +; +; CHECK-GISEL-LABEL: select_i16: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: tst w0, #0x1 +; CHECK-GISEL-NEXT: csel w0, w1, w2, ne +; CHECK-GISEL-NEXT: ret %1 = select i1 %c, i16 %a, i16 %b ret i16 %1 } define i32 @select_i32(i1 zeroext %c, i32 %a, i32 %b) { -; CHECK-LABEL: select_i32 -; CHECK: {{cmp w0, #0|tst w0, #0x1}} -; CHECK-NEXT: csel {{w[0-9]+}}, w1, w2, ne +; GISEL-LABEL: select_i32: +; GISEL: ; %bb.0: +; GISEL-NEXT: tst w0, #0x1 +; GISEL-NEXT: csel w0, w1, w2, ne +; GISEL-NEXT: ret +; CHECK-FASTISEL-LABEL: select_i32: +; CHECK-FASTISEL: ; %bb.0: +; CHECK-FASTISEL-NEXT: tst w0, #0x1 +; CHECK-FASTISEL-NEXT: csel w0, w1, w2, ne +; CHECK-FASTISEL-NEXT: ret +; +; CHECK-GISEL-LABEL: select_i32: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: tst w0, #0x1 +; CHECK-GISEL-NEXT: csel w0, w1, w2, ne +; CHECK-GISEL-NEXT: ret %1 = select i1 %c, i32 %a, i32 %b ret i32 %1 } define i64 @select_i64(i1 zeroext %c, i64 %a, i64 %b) { -; CHECK-LABEL: select_i64 -; CHECK: {{cmp w0, #0|tst w0, #0x1}} -; CHECK-NEXT: csel {{x[0-9]+}}, x1, x2, ne +; GISEL-LABEL: select_i64: +; GISEL: ; %bb.0: +; GISEL-NEXT: tst w0, #0x1 +; GISEL-NEXT: csel x0, x1, x2, ne +; GISEL-NEXT: ret +; CHECK-FASTISEL-LABEL: select_i64: +; CHECK-FASTISEL: ; %bb.0: +; CHECK-FASTISEL-NEXT: tst w0, #0x1 +; 
CHECK-FASTISEL-NEXT: csel x0, x1, x2, ne +; CHECK-FASTISEL-NEXT: ret +; +; CHECK-GISEL-LABEL: select_i64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: tst w0, #0x1 +; CHECK-GISEL-NEXT: csel x0, x1, x2, ne +; CHECK-GISEL-NEXT: ret %1 = select i1 %c, i64 %a, i64 %b ret i64 %1 } define float @select_f32(i1 zeroext %c, float %a, float %b) { -; CHECK-LABEL: select_f32 -; CHECK: {{cmp w0, #0|tst w0, #0x1}} -; CHECK-NEXT: fcsel {{s[0-9]+}}, s0, s1, ne -; GISEL-LABEL: select_f32 -; GISEL: {{cmp w0, #0|tst w0, #0x1}} -; GISEL-NEXT: fcsel {{s[0-9]+}}, s0, s1, ne +; GISEL-LABEL: select_f32: +; GISEL: ; %bb.0: +; GISEL-NEXT: tst w0, #0x1 +; GISEL-NEXT: fcsel s0, s0, s1, ne +; GISEL-NEXT: ret +; CHECK-FASTISEL-LABEL: select_f32: +; CHECK-FASTISEL: ; %bb.0: +; CHECK-FASTISEL-NEXT: tst w0, #0x1 +; CHECK-FASTISEL-NEXT: fcsel s0, s0, s1, ne +; CHECK-FASTISEL-NEXT: ret +; +; CHECK-GISEL-LABEL: select_f32: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: tst w0, #0x1 +; CHECK-GISEL-NEXT: fcsel s0, s0, s1, ne +; CHECK-GISEL-NEXT: ret %1 = select i1 %c, float %a, float %b ret float %1 } define double @select_f64(i1 zeroext %c, double %a, double %b) { -; CHECK-LABEL: select_f64 -; CHECK: {{cmp w0, #0|tst w0, #0x1}} -; CHECK-NEXT: fcsel {{d[0-9]+}}, d0, d1, ne -; GISEL-LABEL: select_f64 -; GISEL: {{cmp w0, #0|tst w0, #0x1}} -; GISEL-NEXT: fcsel {{d[0-9]+}}, d0, d1, ne +; GISEL-LABEL: select_f64: +; GISEL: ; %bb.0: +; GISEL-NEXT: tst w0, #0x1 +; GISEL-NEXT: fcsel d0, d0, d1, ne +; GISEL-NEXT: ret +; CHECK-FASTISEL-LABEL: select_f64: +; CHECK-FASTISEL: ; %bb.0: +; CHECK-FASTISEL-NEXT: tst w0, #0x1 +; CHECK-FASTISEL-NEXT: fcsel d0, d0, d1, ne +; CHECK-FASTISEL-NEXT: ret +; +; CHECK-GISEL-LABEL: select_f64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: tst w0, #0x1 +; CHECK-GISEL-NEXT: fcsel d0, d0, d1, ne +; CHECK-GISEL-NEXT: ret %1 = select i1 %c, double %a, double %b ret double %1 } ; Now test the folding of all compares. 
define float @select_fcmp_false(float %x, float %a, float %b) { -; CHECK-LABEL: select_fcmp_false -; CHECK: fmov {{s[0-9]+}}, s2 +; CHECK-FASTISEL-LABEL: select_fcmp_false: +; CHECK-FASTISEL: ; %bb.0: +; CHECK-FASTISEL-NEXT: fmov s0, s2 +; CHECK-FASTISEL-NEXT: ret +; +; CHECK-GISEL-LABEL: select_fcmp_false: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: fcmp s0, s0 +; CHECK-GISEL-NEXT: fcsel s0, s1, s2, gt +; CHECK-GISEL-NEXT: ret +; GISEL-LABEL: select_fcmp_false: +; GISEL: ; %bb.0: +; GISEL-NEXT: fcmp s0, s0 +; GISEL-NEXT: fcsel s0, s1, s2, gt +; GISEL-NEXT: ret %1 = fcmp ogt float %x, %x %2 = select i1 %1, float %a, float %b ret float %2 } define float @select_fcmp_ogt(float %x, float %y, float %a, float %b) { -; CHECK-LABEL: select_fcmp_ogt -; CHECK: fcmp s0, s1 -; CHECK-NEXT: fcsel {{s[0-9]+}}, s2, s3, gt +; CHECK-LABEL: select_fcmp_ogt: +; CHECK: ; %bb.0: +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: fcsel s0, s2, s3, gt +; CHECK-NEXT: ret +; GISEL-LABEL: select_fcmp_ogt: +; GISEL: ; %bb.0: +; GISEL-NEXT: fcmp s0, s1 +; GISEL-NEXT: fcsel s0, s2, s3, gt +; GISEL-NEXT: ret %1 = fcmp ogt float %x, %y %2 = select i1 %1, float %a, float %b ret float %2 } define float @select_fcmp_oge(float %x, float %y, float %a, float %b) { -; CHECK-LABEL: select_fcmp_oge -; CHECK: fcmp s0, s1 -; CHECK-NEXT: fcsel {{s[0-9]+}}, s2, s3, ge +; CHECK-LABEL: select_fcmp_oge: +; CHECK: ; %bb.0: +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: fcsel s0, s2, s3, ge +; CHECK-NEXT: ret +; GISEL-LABEL: select_fcmp_oge: +; GISEL: ; %bb.0: +; GISEL-NEXT: fcmp s0, s1 +; GISEL-NEXT: fcsel s0, s2, s3, ge +; GISEL-NEXT: ret %1 = fcmp oge float %x, %y %2 = select i1 %1, float %a, float %b ret float %2 } define float @select_fcmp_olt(float %x, float %y, float %a, float %b) { -; CHECK-LABEL: select_fcmp_olt -; CHECK: fcmp s0, s1 -; CHECK-NEXT: fcsel {{s[0-9]+}}, s2, s3, mi +; CHECK-LABEL: select_fcmp_olt: +; CHECK: ; %bb.0: +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: fcsel s0, s2, s3, mi +; CHECK-NEXT: ret +; 
GISEL-LABEL: select_fcmp_olt: +; GISEL: ; %bb.0: +; GISEL-NEXT: fcmp s0, s1 +; GISEL-NEXT: fcsel s0, s2, s3, mi +; GISEL-NEXT: ret %1 = fcmp olt float %x, %y %2 = select i1 %1, float %a, float %b ret float %2 } define float @select_fcmp_ole(float %x, float %y, float %a, float %b) { -; CHECK-LABEL: select_fcmp_ole -; CHECK: fcmp s0, s1 -; CHECK-NEXT: fcsel {{s[0-9]+}}, s2, s3, ls +; CHECK-LABEL: select_fcmp_ole: +; CHECK: ; %bb.0: +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: fcsel s0, s2, s3, ls +; CHECK-NEXT: ret +; GISEL-LABEL: select_fcmp_ole: +; GISEL: ; %bb.0: +; GISEL-NEXT: fcmp s0, s1 +; GISEL-NEXT: fcsel s0, s2, s3, ls +; GISEL-NEXT: ret %1 = fcmp ole float %x, %y %2 = select i1 %1, float %a, float %b ret float %2 } define float @select_fcmp_one(float %x, float %y, float %a, float %b) { -; CHECK-LABEL: select_fcmp_one -; CHECK: fcmp s0, s1 -; CHECK-NEXT: fcsel [[REG:s[0-9]+]], s2, s3, mi -; CHECK-NEXT: fcsel {{s[0-9]+}}, s2, [[REG]], gt +; CHECK-FASTISEL-LABEL: select_fcmp_one: +; CHECK-FASTISEL: ; %bb.0: +; CHECK-FASTISEL-NEXT: fcmp s0, s1 +; CHECK-FASTISEL-NEXT: fcsel s0, s2, s3, mi +; CHECK-FASTISEL-NEXT: fcsel s0, s2, s0, gt +; CHECK-FASTISEL-NEXT: ret +; +; CHECK-GISEL-LABEL: select_fcmp_one: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: fcmp s0, s1 +; CHECK-GISEL-NEXT: cset w8, mi +; CHECK-GISEL-NEXT: cset w9, gt +; CHECK-GISEL-NEXT: orr w8, w8, w9 +; CHECK-GISEL-NEXT: tst w8, #0x1 +; CHECK-GISEL-NEXT: fcsel s0, s2, s3, ne +; CHECK-GISEL-NEXT: ret +; GISEL-LABEL: select_fcmp_one: +; GISEL: ; %bb.0: +; GISEL-NEXT: fcmp s0, s1 +; GISEL-NEXT: cset w8, mi +; GISEL-NEXT: cset w9, gt +; GISEL-NEXT: orr w8, w8, w9 +; GISEL-NEXT: tst w8, #0x1 +; GISEL-NEXT: fcsel s0, s2, s3, ne +; GISEL-NEXT: ret %1 = fcmp one float %x, %y %2 = select i1 %1, float %a, float %b ret float %2 } define float @select_fcmp_ord(float %x, float %y, float %a, float %b) { -; CHECK-LABEL: select_fcmp_ord -; CHECK: fcmp s0, s1 -; CHECK-NEXT: fcsel {{s[0-9]+}}, s2, s3, vc +; CHECK-LABEL: 
select_fcmp_ord: +; CHECK: ; %bb.0: +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: fcsel s0, s2, s3, vc +; CHECK-NEXT: ret +; GISEL-LABEL: select_fcmp_ord: +; GISEL: ; %bb.0: +; GISEL-NEXT: fcmp s0, s1 +; GISEL-NEXT: fcsel s0, s2, s3, vc +; GISEL-NEXT: ret %1 = fcmp ord float %x, %y %2 = select i1 %1, float %a, float %b ret float %2 } define float @select_fcmp_uno(float %x, float %y, float %a, float %b) { -; CHECK-LABEL: select_fcmp_uno -; CHECK: fcmp s0, s1 -; CHECK-NEXT: fcsel {{s[0-9]+}}, s2, s3, vs +; CHECK-LABEL: select_fcmp_uno: +; CHECK: ; %bb.0: +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: fcsel s0, s2, s3, vs +; CHECK-NEXT: ret +; GISEL-LABEL: select_fcmp_uno: +; GISEL: ; %bb.0: +; GISEL-NEXT: fcmp s0, s1 +; GISEL-NEXT: fcsel s0, s2, s3, vs +; GISEL-NEXT: ret %1 = fcmp uno float %x, %y %2 = select i1 %1, float %a, float %b ret float %2 } define float @select_fcmp_ueq(float %x, float %y, float %a, float %b) { -; CHECK-LABEL: select_fcmp_ueq -; CHECK: fcmp s0, s1 -; CHECK-NEXT: fcsel [[REG:s[0-9]+]], s2, s3, eq -; CHECK-NEXT: fcsel {{s[0-9]+}}, s2, [[REG]], vs +; CHECK-FASTISEL-LABEL: select_fcmp_ueq: +; CHECK-FASTISEL: ; %bb.0: +; CHECK-FASTISEL-NEXT: fcmp s0, s1 +; CHECK-FASTISEL-NEXT: fcsel s0, s2, s3, eq +; CHECK-FASTISEL-NEXT: fcsel s0, s2, s0, vs +; CHECK-FASTISEL-NEXT: ret +; +; CHECK-GISEL-LABEL: select_fcmp_ueq: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: fcmp s0, s1 +; CHECK-GISEL-NEXT: cset w8, eq +; CHECK-GISEL-NEXT: cset w9, vs +; CHECK-GISEL-NEXT: orr w8, w8, w9 +; CHECK-GISEL-NEXT: tst w8, #0x1 +; CHECK-GISEL-NEXT: fcsel s0, s2, s3, ne +; CHECK-GISEL-NEXT: ret +; GISEL-LABEL: select_fcmp_ueq: +; GISEL: ; %bb.0: +; GISEL-NEXT: fcmp s0, s1 +; GISEL-NEXT: cset w8, eq +; GISEL-NEXT: cset w9, vs +; GISEL-NEXT: orr w8, w8, w9 +; GISEL-NEXT: tst w8, #0x1 +; GISEL-NEXT: fcsel s0, s2, s3, ne +; GISEL-NEXT: ret %1 = fcmp ueq float %x, %y %2 = select i1 %1, float %a, float %b ret float %2 } define float @select_fcmp_ugt(float %x, float %y, float %a, float %b) 
{ -; CHECK-LABEL: select_fcmp_ugt -; CHECK: fcmp s0, s1 -; CHECK-NEXT: fcsel {{s[0-9]+}}, s2, s3, hi +; CHECK-LABEL: select_fcmp_ugt: +; CHECK: ; %bb.0: +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: fcsel s0, s2, s3, hi +; CHECK-NEXT: ret +; GISEL-LABEL: select_fcmp_ugt: +; GISEL: ; %bb.0: +; GISEL-NEXT: fcmp s0, s1 +; GISEL-NEXT: fcsel s0, s2, s3, hi +; GISEL-NEXT: ret %1 = fcmp ugt float %x, %y %2 = select i1 %1, float %a, float %b ret float %2 } define float @select_fcmp_uge(float %x, float %y, float %a, float %b) { -; CHECK-LABEL: select_fcmp_uge -; CHECK: fcmp s0, s1 -; CHECK-NEXT: fcsel {{s[0-9]+}}, s2, s3, pl +; CHECK-LABEL: select_fcmp_uge: +; CHECK: ; %bb.0: +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: fcsel s0, s2, s3, pl +; CHECK-NEXT: ret +; GISEL-LABEL: select_fcmp_uge: +; GISEL: ; %bb.0: +; GISEL-NEXT: fcmp s0, s1 +; GISEL-NEXT: fcsel s0, s2, s3, pl +; GISEL-NEXT: ret %1 = fcmp uge float %x, %y %2 = select i1 %1, float %a, float %b ret float %2 } define float @select_fcmp_ult(float %x, float %y, float %a, float %b) { -; CHECK-LABEL: select_fcmp_ult -; CHECK: fcmp s0, s1 -; CHECK-NEXT: fcsel {{s[0-9]+}}, s2, s3, lt +; CHECK-LABEL: select_fcmp_ult: +; CHECK: ; %bb.0: +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: fcsel s0, s2, s3, lt +; CHECK-NEXT: ret +; GISEL-LABEL: select_fcmp_ult: +; GISEL: ; %bb.0: +; GISEL-NEXT: fcmp s0, s1 +; GISEL-NEXT: fcsel s0, s2, s3, lt +; GISEL-NEXT: ret %1 = fcmp ult float %x, %y %2 = select i1 %1, float %a, float %b ret float %2 @@ -177,116 +384,221 @@ define float @select_fcmp_ult(float %x, float %y, float %a, float %b) { define float @select_fcmp_ule(float %x, float %y, float %a, float %b) { -; CHECK-LABEL: select_fcmp_ule -; CHECK: fcmp s0, s1 -; CHECK-NEXT: fcsel {{s[0-9]+}}, s2, s3, le +; CHECK-LABEL: select_fcmp_ule: +; CHECK: ; %bb.0: +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: fcsel s0, s2, s3, le +; CHECK-NEXT: ret +; GISEL-LABEL: select_fcmp_ule: +; GISEL: ; %bb.0: +; GISEL-NEXT: fcmp s0, s1 +; GISEL-NEXT: fcsel s0, s2, s3, 
le +; GISEL-NEXT: ret %1 = fcmp ule float %x, %y %2 = select i1 %1, float %a, float %b ret float %2 } define float @select_fcmp_une(float %x, float %y, float %a, float %b) { -; CHECK-LABEL: select_fcmp_une -; CHECK: fcmp s0, s1 -; CHECK-NEXT: fcsel {{s[0-9]+}}, s2, s3, ne +; CHECK-LABEL: select_fcmp_une: +; CHECK: ; %bb.0: +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: fcsel s0, s2, s3, ne +; CHECK-NEXT: ret +; GISEL-LABEL: select_fcmp_une: +; GISEL: ; %bb.0: +; GISEL-NEXT: fcmp s0, s1 +; GISEL-NEXT: fcsel s0, s2, s3, ne +; GISEL-NEXT: ret %1 = fcmp une float %x, %y %2 = select i1 %1, float %a, float %b ret float %2 } define float @select_fcmp_true(float %x, float %a, float %b) { -; CHECK-LABEL: select_fcmp_true -; CHECK: fmov {{s[0-9]+}}, s1 +; CHECK-FASTISEL-LABEL: select_fcmp_true: +; CHECK-FASTISEL: ; %bb.0: +; CHECK-FASTISEL-NEXT: fmov s0, s1 +; CHECK-FASTISEL-NEXT: ret +; +; CHECK-GISEL-LABEL: select_fcmp_true: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: fcmp s0, s0 +; CHECK-GISEL-NEXT: cset w8, eq +; CHECK-GISEL-NEXT: cset w9, vs +; CHECK-GISEL-NEXT: orr w8, w8, w9 +; CHECK-GISEL-NEXT: tst w8, #0x1 +; CHECK-GISEL-NEXT: fcsel s0, s1, s2, ne +; CHECK-GISEL-NEXT: ret +; GISEL-LABEL: select_fcmp_true: +; GISEL: ; %bb.0: +; GISEL-NEXT: fcmp s0, s0 +; GISEL-NEXT: cset w8, eq +; GISEL-NEXT: cset w9, vs +; GISEL-NEXT: orr w8, w8, w9 +; GISEL-NEXT: tst w8, #0x1 +; GISEL-NEXT: fcsel s0, s1, s2, ne +; GISEL-NEXT: ret %1 = fcmp ueq float %x, %x %2 = select i1 %1, float %a, float %b ret float %2 } define float @select_icmp_eq(i32 %x, i32 %y, float %a, float %b) { -; CHECK-LABEL: select_icmp_eq -; CHECK: cmp w0, w1 -; CHECK-NEXT: fcsel {{s[0-9]+}}, s0, s1, eq +; CHECK-LABEL: select_icmp_eq: +; CHECK: ; %bb.0: +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: fcsel s0, s0, s1, eq +; CHECK-NEXT: ret +; GISEL-LABEL: select_icmp_eq: +; GISEL: ; %bb.0: +; GISEL-NEXT: cmp w0, w1 +; GISEL-NEXT: fcsel s0, s0, s1, eq +; GISEL-NEXT: ret %1 = icmp eq i32 %x, %y %2 = select i1 %1, float %a, 
float %b ret float %2 } define float @select_icmp_ne(i32 %x, i32 %y, float %a, float %b) { -; CHECK-LABEL: select_icmp_ne -; CHECK: cmp w0, w1 -; CHECK-NEXT: fcsel {{s[0-9]+}}, s0, s1, ne +; CHECK-LABEL: select_icmp_ne: +; CHECK: ; %bb.0: +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: fcsel s0, s0, s1, ne +; CHECK-NEXT: ret +; GISEL-LABEL: select_icmp_ne: +; GISEL: ; %bb.0: +; GISEL-NEXT: cmp w0, w1 +; GISEL-NEXT: fcsel s0, s0, s1, ne +; GISEL-NEXT: ret %1 = icmp ne i32 %x, %y %2 = select i1 %1, float %a, float %b ret float %2 } define float @select_icmp_ugt(i32 %x, i32 %y, float %a, float %b) { -; CHECK-LABEL: select_icmp_ugt -; CHECK: cmp w0, w1 -; CHECK-NEXT: fcsel {{s[0-9]+}}, s0, s1, hi +; CHECK-LABEL: select_icmp_ugt: +; CHECK: ; %bb.0: +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: fcsel s0, s0, s1, hi +; CHECK-NEXT: ret +; GISEL-LABEL: select_icmp_ugt: +; GISEL: ; %bb.0: +; GISEL-NEXT: cmp w0, w1 +; GISEL-NEXT: fcsel s0, s0, s1, hi +; GISEL-NEXT: ret %1 = icmp ugt i32 %x, %y %2 = select i1 %1, float %a, float %b ret float %2 } define float @select_icmp_uge(i32 %x, i32 %y, float %a, float %b) { -; CHECK-LABEL: select_icmp_uge -; CHECK: cmp w0, w1 -; CHECK-NEXT: fcsel {{s[0-9]+}}, s0, s1, hs +; CHECK-LABEL: select_icmp_uge: +; CHECK: ; %bb.0: +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: fcsel s0, s0, s1, hs +; CHECK-NEXT: ret +; GISEL-LABEL: select_icmp_uge: +; GISEL: ; %bb.0: +; GISEL-NEXT: cmp w0, w1 +; GISEL-NEXT: fcsel s0, s0, s1, hs +; GISEL-NEXT: ret %1 = icmp uge i32 %x, %y %2 = select i1 %1, float %a, float %b ret float %2 } define float @select_icmp_ult(i32 %x, i32 %y, float %a, float %b) { -; CHECK-LABEL: select_icmp_ult -; CHECK: cmp w0, w1 -; CHECK-NEXT: fcsel {{s[0-9]+}}, s0, s1, lo +; CHECK-LABEL: select_icmp_ult: +; CHECK: ; %bb.0: +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: fcsel s0, s0, s1, lo +; CHECK-NEXT: ret +; GISEL-LABEL: select_icmp_ult: +; GISEL: ; %bb.0: +; GISEL-NEXT: cmp w0, w1 +; GISEL-NEXT: fcsel s0, s0, s1, lo +; GISEL-NEXT: ret %1 = icmp ult i32 
%x, %y %2 = select i1 %1, float %a, float %b ret float %2 } define float @select_icmp_ule(i32 %x, i32 %y, float %a, float %b) { -; CHECK-LABEL: select_icmp_ule -; CHECK: cmp w0, w1 -; CHECK-NEXT: fcsel {{s[0-9]+}}, s0, s1, ls +; CHECK-LABEL: select_icmp_ule: +; CHECK: ; %bb.0: +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: fcsel s0, s0, s1, ls +; CHECK-NEXT: ret +; GISEL-LABEL: select_icmp_ule: +; GISEL: ; %bb.0: +; GISEL-NEXT: cmp w0, w1 +; GISEL-NEXT: fcsel s0, s0, s1, ls +; GISEL-NEXT: ret %1 = icmp ule i32 %x, %y %2 = select i1 %1, float %a, float %b ret float %2 } define float @select_icmp_sgt(i32 %x, i32 %y, float %a, float %b) { -; CHECK-LABEL: select_icmp_sgt -; CHECK: cmp w0, w1 -; CHECK-NEXT: fcsel {{s[0-9]+}}, s0, s1, gt +; CHECK-LABEL: select_icmp_sgt: +; CHECK: ; %bb.0: +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: fcsel s0, s0, s1, gt +; CHECK-NEXT: ret +; GISEL-LABEL: select_icmp_sgt: +; GISEL: ; %bb.0: +; GISEL-NEXT: cmp w0, w1 +; GISEL-NEXT: fcsel s0, s0, s1, gt +; GISEL-NEXT: ret %1 = icmp sgt i32 %x, %y %2 = select i1 %1, float %a, float %b ret float %2 } define float @select_icmp_sge(i32 %x, i32 %y, float %a, float %b) { -; CHECK-LABEL: select_icmp_sge -; CHECK: cmp w0, w1 -; CHECK-NEXT: fcsel {{s[0-9]+}}, s0, s1, ge +; CHECK-LABEL: select_icmp_sge: +; CHECK: ; %bb.0: +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: fcsel s0, s0, s1, ge +; CHECK-NEXT: ret +; GISEL-LABEL: select_icmp_sge: +; GISEL: ; %bb.0: +; GISEL-NEXT: cmp w0, w1 +; GISEL-NEXT: fcsel s0, s0, s1, ge +; GISEL-NEXT: ret %1 = icmp sge i32 %x, %y %2 = select i1 %1, float %a, float %b ret float %2 } define float @select_icmp_slt(i32 %x, i32 %y, float %a, float %b) { -; CHECK-LABEL: select_icmp_slt -; CHECK: cmp w0, w1 -; CHECK-NEXT: fcsel {{s[0-9]+}}, s0, s1, lt +; CHECK-LABEL: select_icmp_slt: +; CHECK: ; %bb.0: +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: fcsel s0, s0, s1, lt +; CHECK-NEXT: ret +; GISEL-LABEL: select_icmp_slt: +; GISEL: ; %bb.0: +; GISEL-NEXT: cmp w0, w1 +; GISEL-NEXT: fcsel s0, s0, s1, 
lt +; GISEL-NEXT: ret %1 = icmp slt i32 %x, %y %2 = select i1 %1, float %a, float %b ret float %2 } define float @select_icmp_sle(i32 %x, i32 %y, float %a, float %b) { -; CHECK-LABEL: select_icmp_sle -; CHECK: cmp w0, w1 -; CHECK-NEXT: fcsel {{s[0-9]+}}, s0, s1, le +; CHECK-LABEL: select_icmp_sle: +; CHECK: ; %bb.0: +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: fcsel s0, s0, s1, le +; CHECK-NEXT: ret +; GISEL-LABEL: select_icmp_sle: +; GISEL: ; %bb.0: +; GISEL-NEXT: cmp w0, w1 +; GISEL-NEXT: fcsel s0, s0, s1, le +; GISEL-NEXT: ret %1 = icmp sle i32 %x, %y %2 = select i1 %1, float %a, float %b ret float %2 @@ -294,30 +606,86 @@ define float @select_icmp_sle(i32 %x, i32 %y, float %a, float %b) { ; Test peephole optimizations for select. define zeroext i1 @select_opt1(i1 zeroext %c, i1 zeroext %a) { -; CHECK-LABEL: select_opt1 -; CHECK: orr {{w[0-9]+}}, w0, w1 +; GISEL-LABEL: select_opt1: +; GISEL: ; %bb.0: +; GISEL-NEXT: orr w8, w0, w1 +; GISEL-NEXT: and w0, w8, #0x1 +; GISEL-NEXT: ret +; CHECK-FASTISEL-LABEL: select_opt1: +; CHECK-FASTISEL: ; %bb.0: +; CHECK-FASTISEL-NEXT: orr w8, w0, w1 +; CHECK-FASTISEL-NEXT: and w0, w8, #0x1 +; CHECK-FASTISEL-NEXT: ret +; +; CHECK-GISEL-LABEL: select_opt1: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: orr w8, w0, w1 +; CHECK-GISEL-NEXT: and w0, w8, #0x1 +; CHECK-GISEL-NEXT: ret %1 = select i1 %c, i1 true, i1 %a ret i1 %1 } define zeroext i1 @select_opt2(i1 zeroext %c, i1 zeroext %a) { -; CHECK-LABEL: select_opt2 -; CHECK: eor [[REG:w[0-9]+]], w0, #0x1 -; CHECK: orr {{w[0-9]+}}, [[REG]], w1 +; GISEL-LABEL: select_opt2: +; GISEL: ; %bb.0: +; GISEL-NEXT: eor w8, w0, #0x1 +; GISEL-NEXT: orr w8, w8, w1 +; GISEL-NEXT: and w0, w8, #0x1 +; GISEL-NEXT: ret +; CHECK-FASTISEL-LABEL: select_opt2: +; CHECK-FASTISEL: ; %bb.0: +; CHECK-FASTISEL-NEXT: eor w8, w0, #0x1 +; CHECK-FASTISEL-NEXT: orr w8, w8, w1 +; CHECK-FASTISEL-NEXT: and w0, w8, #0x1 +; CHECK-FASTISEL-NEXT: ret +; +; CHECK-GISEL-LABEL: select_opt2: +; CHECK-GISEL: ; %bb.0: +; 
CHECK-GISEL-NEXT: eor w8, w0, #0x1 +; CHECK-GISEL-NEXT: orr w8, w8, w1 +; CHECK-GISEL-NEXT: and w0, w8, #0x1 +; CHECK-GISEL-NEXT: ret %1 = select i1 %c, i1 %a, i1 true ret i1 %1 } define zeroext i1 @select_opt3(i1 zeroext %c, i1 zeroext %a) { -; CHECK-LABEL: select_opt3 -; CHECK: bic {{w[0-9]+}}, w1, w0 +; GISEL-LABEL: select_opt3: +; GISEL: ; %bb.0: +; GISEL-NEXT: eor w8, w0, #0x1 +; GISEL-NEXT: and w0, w8, w1 +; GISEL-NEXT: ret +; CHECK-FASTISEL-LABEL: select_opt3: +; CHECK-FASTISEL: ; %bb.0: +; CHECK-FASTISEL-NEXT: bic w8, w1, w0 +; CHECK-FASTISEL-NEXT: and w0, w8, #0x1 +; CHECK-FASTISEL-NEXT: ret +; +; CHECK-GISEL-LABEL: select_opt3: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: eor w8, w0, #0x1 +; CHECK-GISEL-NEXT: and w0, w8, w1 +; CHECK-GISEL-NEXT: ret %1 = select i1 %c, i1 false, i1 %a ret i1 %1 } define zeroext i1 @select_opt4(i1 zeroext %c, i1 zeroext %a) { -; CHECK-LABEL: select_opt4 -; CHECK: and {{w[0-9]+}}, w0, w1 +; GISEL-LABEL: select_opt4: +; GISEL: ; %bb.0: +; GISEL-NEXT: and w0, w0, w1 +; GISEL-NEXT: ret +; CHECK-FASTISEL-LABEL: select_opt4: +; CHECK-FASTISEL: ; %bb.0: +; CHECK-FASTISEL-NEXT: and w8, w0, w1 +; CHECK-FASTISEL-NEXT: and w0, w8, #0x1 +; CHECK-FASTISEL-NEXT: ret +; +; CHECK-GISEL-LABEL: select_opt4: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: and w0, w0, w1 +; CHECK-GISEL-NEXT: ret %1 = select i1 %c, i1 %a, i1 false ret i1 %1 } diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index b2f9bf89d9ec60..7d8eba1e870804 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -1509,49 +1509,39 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v3 ; GFX9-G-O0-NEXT: v_xor_b32_e64 v1, v12, v1 -; GFX9-G-O0-NEXT: v_xor_b32_e64 v4, v12, v2 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v2, v12, v2 +; GFX9-G-O0-NEXT: 
v_mov_b32_e32 v4, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v4, v10, v4 ; GFX9-G-O0-NEXT: v_xor_b32_e64 v3, v10, v3 -; GFX9-G-O0-NEXT: v_xor_b32_e64 v2, v10, v2 ; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v1, s[6:7], v1, v12 -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v7, s[6:7], v4, v12, s[6:7] -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v6, s[6:7], v3, v10, s[6:7] -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v5, s[6:7], v2, v10, s[6:7] -; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v7 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v6 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5 -; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v13 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v2, s[6:7], v2, v12, s[6:7] +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v6, s[6:7], v4, v10, s[6:7] +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v3, s[6:7], v3, v10, s[6:7] +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v14 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v15 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v16 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v6 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v7 -; GFX9-G-O0-NEXT: v_xor_b32_e64 v5, v11, v5 -; 
GFX9-G-O0-NEXT: v_xor_b32_e64 v8, v11, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v8, v11, v5 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v5, v11, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v13 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v14 ; GFX9-G-O0-NEXT: v_xor_b32_e64 v7, v9, v7 -; GFX9-G-O0-NEXT: v_xor_b32_e64 v6, v9, v6 -; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v5, s[6:7], v5, v11 -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v15, s[6:7], v8, v11, s[6:7] -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v14, s[6:7], v7, v9, s[6:7] -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v13, s[6:7], v6, v9, s[6:7] -; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v15 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v14 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v13 -; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_xor_b32_e64 v4, v9, v4 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v8, s[6:7], v8, v11 ; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v5, s[6:7], v5, v11, s[6:7] +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v7, s[6:7], v7, v9, s[6:7] +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v4, s[6:7], v4, v9, s[6:7] +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: v_xor_b32_e64 v13, v11, v12 ; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; 
GFX9-G-O0-NEXT: v_xor_b32_e64 v11, v11, v12 @@ -1560,97 +1550,69 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: v_xor_b32_e64 v9, v9, v10 ; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v6 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v8 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v7 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v10 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v13 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v14 -; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v9, v12 -; GFX9-G-O0-NEXT: v_or_b32_e64 v11, v10, v11 +; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v8, v7 +; GFX9-G-O0-NEXT: v_or_b32_e64 v11, v5, v4 ; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec ; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4 ; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[6:7], v[9:10], v[11:12] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v2 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v4 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v3 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v10 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v13 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v14 -; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v9, v12 -; GFX9-G-O0-NEXT: v_or_b32_e64 v11, v10, v11 +; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v1, v6 +; GFX9-G-O0-NEXT: v_or_b32_e64 v11, v2, v3 ; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec ; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4 ; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], v[11:12] ; GFX9-G-O0-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v6 -; GFX9-G-O0-NEXT: 
v_mov_b32_e32 v10, v5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v8 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v7 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], v[5:6] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v10 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], v[11:12] ; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v5, v5 -; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 32 -; GFX9-G-O0-NEXT: v_add_u32_e64 v6, v6, v7 -; GFX9-G-O0-NEXT: v_min_u32_e64 v5, v5, v6 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v8, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, 32 +; GFX9-G-O0-NEXT: v_add_u32_e64 v8, v8, v9 +; GFX9-G-O0-NEXT: v_min_u32_e64 v5, v5, v8 ; GFX9-G-O0-NEXT: s_mov_b32 s10, 64 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s10 -; GFX9-G-O0-NEXT: v_add_u32_e64 v6, v5, v6 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v5, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s10 +; GFX9-G-O0-NEXT: v_add_u32_e64 v5, v5, v8 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v4, v4 ; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v7, v7 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 32 ; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v7, v8 -; GFX9-G-O0-NEXT: v_min_u32_e64 v5, v5, v7 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[8:9] +; GFX9-G-O0-NEXT: v_min_u32_e64 v4, v4, v7 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[8:9] ; GFX9-G-O0-NEXT: s_mov_b32 s16, 0 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v2 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v4 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v3 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], v[6:7] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v11 -; 
GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v12 -; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6 -; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], v[9:10] +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v5, v2 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v7, v1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 32 ; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v7, v8 -; GFX9-G-O0-NEXT: v_min_u32_e64 v6, v6, v7 +; GFX9-G-O0-NEXT: v_min_u32_e64 v5, v5, v7 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s10 -; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v6, v7 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v9 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v10 -; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6 -; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v8, v8 +; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v5, v7 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v5, v3 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v8, v6 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, 32 ; GFX9-G-O0-NEXT: v_add_u32_e64 v8, v8, v9 -; GFX9-G-O0-NEXT: v_min_u32_e64 v6, v6, v8 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[8:9] +; GFX9-G-O0-NEXT: v_min_u32_e64 v5, v5, v8 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[8:9] ; GFX9-G-O0-NEXT: s_mov_b32 s15, 0 ; GFX9-G-O0-NEXT: s_mov_b32 s11, 0 ; GFX9-G-O0-NEXT: s_mov_b32 s14, 0 ; GFX9-G-O0-NEXT: s_mov_b32 s10, 0 -; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v6, s[8:9], v5, v6 -; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v7, s[8:9], v4, v5 +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s16 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s16 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s16 -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v7, s[8:9], v5, v7, s[8:9] -; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v4, 
s[8:9], v4, v5, s[8:9] +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s15 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s14 ; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v9, s[8:9], v5, v8, s[8:9] @@ -1659,8 +1621,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s10 ; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v5, v8, s[8:9] ; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v6 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v9 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v8 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s5 @@ -1685,35 +1647,27 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v10 ; GFX9-G-O0-NEXT: s_mov_b32 s7, 0x7f ; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 -; GFX9-G-O0-NEXT: v_xor_b32_e64 v6, v6, s7 -; GFX9-G-O0-NEXT: v_xor_b32_e64 v7, v7, s6 -; GFX9-G-O0-NEXT: v_or_b32_e64 v6, v6, v9 -; GFX9-G-O0-NEXT: v_or_b32_e64 v8, v7, v8 -; GFX9-G-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, s5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s4 -; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[6:7], v[8:9] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v2 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v4 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v3 -; GFX9-G-O0-NEXT: v_and_b32_e32 v1, 1, v5 -; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v1 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v7, v7, s7 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v4, v4, s6 +; GFX9-G-O0-NEXT: v_or_b32_e64 v7, v7, v9 +; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v4, v8 +; GFX9-G-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v4 +; 
GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[7:8], v[9:10] +; GFX9-G-O0-NEXT: v_and_b32_e32 v4, 1, v5 +; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v6 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v7 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[6:7] -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v2, v4, s[6:7] ; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-G-O0-NEXT: v_and_b32_e32 v3, 1, v5 -; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-G-O0-NEXT: v_and_b32_e32 v4, 1, v5 +; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v8 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v9 ; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] ; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[6:7] ; GFX9-G-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec @@ -1883,10 +1837,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: v_readlane_b32 s6, v16, 6 ; GFX9-G-O0-NEXT: v_readlane_b32 s7, v16, 7 -; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:212 ; 4-byte 
Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload @@ -1899,14 +1853,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 
; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v2 @@ -1915,7 +1869,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5 ; GFX9-G-O0-NEXT: s_mov_b32 s8, 1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-G-O0-NEXT: v_lshlrev_b64 v[21:22], v2, v[0:1] +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[14:15], v2, v[0:1] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-G-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[3:4] ; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr0 killed $exec @@ -1929,9 +1883,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-G-O0-NEXT: v_or_b32_e64 v7, v2, v3 ; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v0, v1 -; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr12_vgpr13 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v14 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v15 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr23_vgpr24 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v25 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v26 ; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr0 killed $exec ; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec ; GFX9-G-O0-NEXT: s_mov_b32 s9, 31 @@ -1939,81 +1893,73 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v3, v0, v1 ; GFX9-G-O0-NEXT: s_mov_b32 s9, 0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v21 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v22 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v15 ; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v2, v3 ; GFX9-G-O0-NEXT: v_or_b32_e64 
v9, v0, v1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v12 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v13 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v14 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v23 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v24 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v25 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v26 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-G-O0-NEXT: v_lshlrev_b64 v[23:24], v0, v[2:3] +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[27:28], v0, v[2:3] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v0, v[12:13] -; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr2 killed $exec +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v0, v[14:15] +; GFX9-G-O0-NEXT: ; kill: def $vgpr14 killed $vgpr2 killed $exec ; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec ; GFX9-G-O0-NEXT: s_mov_b32 s8, 31 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v14, v2, v3 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v23, v2, v3 ; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s8 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v29, v31 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v30, v32 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v33 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v34 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v25, v33 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v26, v34 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v29 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v30 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v23 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v24 -; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v15 -; GFX9-G-O0-NEXT: v_or_b32_e64 v13, v1, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v24, v27 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v28 +; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v24 +; GFX9-G-O0-NEXT: v_or_b32_e64 v15, v1, v15 ; GFX9-G-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 
killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v13 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v21 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v22 -; GFX9-G-O0-NEXT: v_or3_b32 v12, v12, v14, v15 -; GFX9-G-O0-NEXT: v_or3_b32 v2, v2, v3, v13 -; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v24, v25 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v26 +; GFX9-G-O0-NEXT: v_or3_b32 v14, v14, v23, v24 +; GFX9-G-O0-NEXT: v_or3_b32 v2, v2, v3, v15 +; GFX9-G-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v2 ; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v12 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v15 ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v11, s[8:9], v11, v4 -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v10, v9, s[8:9] -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v8, v7, s[8:9] -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v6, v5, s[8:9] +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v13, s[8:9], v13, v4 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v12, s[8:9], v12, v9, s[8:9] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v10, v7, s[8:9] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v12, s[8:9], v6, v5, s[8:9] ; GFX9-G-O0-NEXT: s_mov_b32 s8, 31 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s8 -; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v8, v6, v10 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v10, v6, v12 ; GFX9-G-O0-NEXT: s_mov_b32 s8, 31 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s8 -; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v6, v6, v10 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v6, v6, v12 ; GFX9-G-O0-NEXT: s_mov_b32 s9, 1 ; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 -; GFX9-G-O0-NEXT: v_and_b32_e64 v12, v8, s9 -; GFX9-G-O0-NEXT: 
v_and_b32_e64 v10, v8, s8 +; GFX9-G-O0-NEXT: v_and_b32_e64 v12, v10, s9 +; GFX9-G-O0-NEXT: v_and_b32_e64 v14, v10, s8 ; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v10 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v24, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, s4 ; GFX9-G-O0-NEXT: ; kill: def $vgpr12_vgpr13 killed $vgpr12_vgpr13 def $vgpr12_vgpr13_vgpr14_vgpr15 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v11 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v10 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v25 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v24, v26 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v27 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v28 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v23 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v24 -; GFX9-G-O0-NEXT: v_and_b32_e64 v11, v8, v11 -; GFX9-G-O0-NEXT: v_and_b32_e64 v10, v8, v10 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v21 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v22 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v23 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v24 +; GFX9-G-O0-NEXT: v_and_b32_e64 v11, v10, v11 +; GFX9-G-O0-NEXT: v_and_b32_e64 v10, v10, v22 ; GFX9-G-O0-NEXT: v_and_b32_e64 v8, v6, v8 ; GFX9-G-O0-NEXT: v_and_b32_e64 v6, v6, v21 ; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v4, s[8:9], v4, v11 @@ -2114,66 +2060,62 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte 
Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b32 s4, 64 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v4 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v7 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v6 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v24, v17 +; GFX9-G-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-G-O0-NEXT: v_sub_u32_e64 v4, v13, v4 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v4, v19, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-G-O0-NEXT: v_sub_u32_e64 v5, v5, v13 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v5, v5, v19 ; 
GFX9-G-O0-NEXT: s_mov_b32 s6, 0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v13, v6 +; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v19, v6 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v13, v6 -; GFX9-G-O0-NEXT: v_lshrrev_b64 v[6:7], v13, v[21:22] -; GFX9-G-O0-NEXT: v_lshrrev_b64 v[26:27], v13, v[15:16] +; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v6 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[6:7], v19, v[21:22] +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[26:27], v19, v[23:24] ; GFX9-G-O0-NEXT: v_lshlrev_b64 v[24:25], v5, v[21:22] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v26 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v26 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v27 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v24 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v25 -; GFX9-G-O0-NEXT: v_or_b32_e64 v14, v14, v23 -; GFX9-G-O0-NEXT: v_or_b32_e64 v13, v5, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v25 +; GFX9-G-O0-NEXT: v_or_b32_e64 v20, v20, v23 +; GFX9-G-O0-NEXT: v_or_b32_e64 v19, v5, v19 ; GFX9-G-O0-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-G-O0-NEXT: v_lshrrev_b64 v[21:22], v4, v[21:22] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v21 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v22 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[4:5] -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v15 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v16 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[6:7] -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v13, v5, v13, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v20, s[4:5] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v19, s[4:5] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v18, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v17, v5, v17, s[6:7] ; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v13 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v17 +; 
GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v6 ; GFX9-G-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr6_vgpr7 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, 0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v17, v17, v18, s[4:5] ; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] -; GFX9-G-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-G-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v6 ; GFX9-G-O0-NEXT: ; kill: def $vgpr4_vgpr5 killed $vgpr4_vgpr5 def $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v13 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v14 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v17 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v18 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v19 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v20 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v18 ; GFX9-G-O0-NEXT: s_mov_b32 s4, -1 ; GFX9-G-O0-NEXT: s_mov_b32 s10, -1 ; GFX9-G-O0-NEXT: s_mov_b32 s7, -1 @@ -2226,14 +2168,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded 
Reload -; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-G-O0-NEXT: s_mov_b32 s6, 1 ; GFX9-G-O0-NEXT: s_mov_b32 s10, 0 @@ -2241,48 +2183,50 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) -; GFX9-G-O0-NEXT: v_add_co_u32_e64 v5, s[6:7], v2, v5 +; GFX9-G-O0-NEXT: v_add_co_u32_e64 v5, s[6:7], v3, v5 ; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s10 ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v6, s[6:7], v4, v6, s[6:7] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v8, s[6:7], v3, v4, s[6:7] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s8 -; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v1, v3, s[6:7] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v6 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v8 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v7 -; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: 
v_addc_co_u32_e64 v6, s[6:7], v6, v8, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s9 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v8, s[6:7], v7, v8, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s8 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v2, v7, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, v7 +; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b32 s6, 0x7f -; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v4, s[6:7], v1, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v9, s[6:7], v2, v3 ; GFX9-G-O0-NEXT: s_mov_b32 s7, 64 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v10 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-G-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-G-O0-NEXT: v_sub_u32_e64 v3, v4, v1 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v3, v9, v1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-G-O0-NEXT: v_sub_u32_e64 v9, v1, v4 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v15, v1, v9 ; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7 -; 
GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[8:9], v4, v1 +; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[8:9], v9, v1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v4, v1 -; GFX9-G-O0-NEXT: v_lshlrev_b64 v[1:2], v4, v[13:14] -; GFX9-G-O0-NEXT: v_lshrrev_b64 v[18:19], v9, v[13:14] -; GFX9-G-O0-NEXT: v_lshlrev_b64 v[16:17], v4, v[11:12] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v18 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v19 +; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v9, v1 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[1:2], v9, v[13:14] +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[18:19], v15, v[13:14] +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[16:17], v9, v[11:12] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v19 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v16 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v17 -; GFX9-G-O0-NEXT: v_or_b32_e64 v10, v10, v15 -; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v4, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v17 +; GFX9-G-O0-NEXT: v_or_b32_e64 v12, v12, v15 +; GFX9-G-O0-NEXT: v_or_b32_e64 v11, v9, v11 ; GFX9-G-O0-NEXT: v_lshlrev_b64 v[13:14], v3, v[13:14] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v2 @@ -2294,10 +2238,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v3 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v13 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v14 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[8:9] -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[8:9] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v12 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v9, v12, s[8:9] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[8:9] ; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[6:7] ; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[6:7] ; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec From c9c244423ffb8071bb838c3606052e12af537047 Mon Sep 17 00:00:00 2001 From: Angel Zhang Date: Wed, 29 May 2024 12:19:32 -0400 
Subject: [PATCH 144/230] [mlir][spirv] Add integration test for `vector.interleave` and `vector.shuffle` (#93595) - Add integration test for `vector.shuffle` and `vector.interleave`, mentioned in issue #91978 - Add `VectorToSPIRV` patterns to `GPUToSPIRVPass` --------- Co-authored-by: Jakub Kuderski --- .../Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp | 2 + .../mlir-vulkan-runner/vector-interleave.mlir | 53 +++++++++++++++++++ .../mlir-vulkan-runner/vector-shuffle.mlir | 53 +++++++++++++++++++ 3 files changed, 108 insertions(+) create mode 100644 mlir/test/mlir-vulkan-runner/vector-interleave.mlir create mode 100644 mlir/test/mlir-vulkan-runner/vector-shuffle.mlir diff --git a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp index 1d1db913e3df23..53e73ec0d81bf0 100644 --- a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp +++ b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp @@ -18,6 +18,7 @@ #include "mlir/Conversion/GPUToSPIRV/GPUToSPIRV.h" #include "mlir/Conversion/MemRefToSPIRV/MemRefToSPIRV.h" #include "mlir/Conversion/SCFToSPIRV/SCFToSPIRV.h" +#include "mlir/Conversion/VectorToSPIRV/VectorToSPIRV.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h" @@ -132,6 +133,7 @@ void GPUToSPIRVPass::runOnOperation() { mlir::arith::populateArithToSPIRVPatterns(typeConverter, patterns); populateMemRefToSPIRVPatterns(typeConverter, patterns); populateFuncToSPIRVPatterns(typeConverter, patterns); + populateVectorToSPIRVPatterns(typeConverter, patterns); if (failed(applyFullConversion(gpuModule, *target, std::move(patterns)))) return signalPassFailure(); diff --git a/mlir/test/mlir-vulkan-runner/vector-interleave.mlir b/mlir/test/mlir-vulkan-runner/vector-interleave.mlir new file mode 100644 index 00000000000000..2f5c319e2f5c5d --- /dev/null +++ b/mlir/test/mlir-vulkan-runner/vector-interleave.mlir @@ -0,0 +1,53 @@ +// RUN: 
mlir-vulkan-runner %s \ +// RUN: --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils \ +// RUN: --entry-point-result=void | FileCheck %s + +// CHECK: [0, 2, 1, 3] +module attributes { + gpu.container_module, + spirv.target_env = #spirv.target_env< + #spirv.vce, #spirv.resource_limits<>> +} { + gpu.module @kernels { + gpu.func @kernel_vector_interleave(%arg0 : memref<2xi32>, %arg1 : memref<2xi32>, %arg2 : memref<4xi32>) + kernel attributes { spirv.entry_point_abi = #spirv.entry_point_abi} { + %c0 = arith.constant 0 : index + %vec0 = vector.load %arg0[%c0] : memref<2xi32>, vector<2xi32> + %vec1 = vector.load %arg1[%c0] : memref<2xi32>, vector<2xi32> + %result = vector.interleave %vec0, %vec1 : vector<2xi32> -> vector<4xi32> + vector.store %result, %arg2[%c0] : memref<4xi32>, vector<4xi32> + gpu.return + } + } + + func.func @main() { + // Allocate 3 buffers. + %buf0 = memref.alloc() : memref<2xi32> + %buf1 = memref.alloc() : memref<2xi32> + %buf2 = memref.alloc() : memref<4xi32> + + %idx0 = arith.constant 0 : index + %idx1 = arith.constant 1 : index + %idx4 = arith.constant 4 : index + + // Initialize input buffer. + %buf0_vals = arith.constant dense<[0, 1]> : vector<2xi32> + %buf1_vals = arith.constant dense<[2, 3]> : vector<2xi32> + vector.store %buf0_vals, %buf0[%idx0] : memref<2xi32>, vector<2xi32> + vector.store %buf1_vals, %buf1[%idx0] : memref<2xi32>, vector<2xi32> + + // Initialize output buffer. 
+ %value0 = arith.constant 0 : i32 + %buf3 = memref.cast %buf2 : memref<4xi32> to memref + call @fillResource1DInt(%buf3, %value0) : (memref, i32) -> () + + gpu.launch_func @kernels::@kernel_vector_interleave + blocks in (%idx4, %idx1, %idx1) threads in (%idx1, %idx1, %idx1) + args(%buf0 : memref<2xi32>, %buf1 : memref<2xi32>, %buf2 : memref<4xi32>) + %buf4 = memref.cast %buf3 : memref to memref<*xi32> + call @printMemrefI32(%buf4) : (memref<*xi32>) -> () + return + } + func.func private @fillResource1DInt(%0 : memref, %1 : i32) + func.func private @printMemrefI32(%ptr : memref<*xi32>) +} diff --git a/mlir/test/mlir-vulkan-runner/vector-shuffle.mlir b/mlir/test/mlir-vulkan-runner/vector-shuffle.mlir new file mode 100644 index 00000000000000..e29e054ccd46be --- /dev/null +++ b/mlir/test/mlir-vulkan-runner/vector-shuffle.mlir @@ -0,0 +1,53 @@ +// RUN: mlir-vulkan-runner %s \ +// RUN: --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils \ +// RUN: --entry-point-result=void | FileCheck %s + +// CHECK: [2, 1, 3] +module attributes { + gpu.container_module, + spirv.target_env = #spirv.target_env< + #spirv.vce, #spirv.resource_limits<>> +} { + gpu.module @kernels { + gpu.func @kernel_vector_shuffle(%arg0 : memref<2xi32>, %arg1 : memref<2xi32>, %arg2 : memref<3xi32>) + kernel attributes { spirv.entry_point_abi = #spirv.entry_point_abi} { + %c0 = arith.constant 0 : index + %vec0 = vector.load %arg0[%c0] : memref<2xi32>, vector<2xi32> + %vec1 = vector.load %arg1[%c0] : memref<2xi32>, vector<2xi32> + %result = vector.shuffle %vec0, %vec1[2, 1, 3] : vector<2xi32>, vector<2xi32> + vector.store %result, %arg2[%c0] : memref<3xi32>, vector<3xi32> + gpu.return + } + } + + func.func @main() { + // Allocate 3 buffers. 
+ %buf0 = memref.alloc() : memref<2xi32> + %buf1 = memref.alloc() : memref<2xi32> + %buf2 = memref.alloc() : memref<3xi32> + + %idx0 = arith.constant 0 : index + %idx1 = arith.constant 1 : index + %idx4 = arith.constant 4 : index + + // Initialize input buffer + %buf0_vals = arith.constant dense<[0, 1]> : vector<2xi32> + %buf1_vals = arith.constant dense<[2, 3]> : vector<2xi32> + vector.store %buf0_vals, %buf0[%idx0] : memref<2xi32>, vector<2xi32> + vector.store %buf1_vals, %buf1[%idx0] : memref<2xi32>, vector<2xi32> + + // Initialize output buffer. + %value0 = arith.constant 0 : i32 + %buf3 = memref.cast %buf2 : memref<3xi32> to memref + call @fillResource1DInt(%buf3, %value0) : (memref, i32) -> () + + gpu.launch_func @kernels::@kernel_vector_shuffle + blocks in (%idx4, %idx1, %idx1) threads in (%idx1, %idx1, %idx1) + args(%buf0 : memref<2xi32>, %buf1 : memref<2xi32>, %buf2 : memref<3xi32>) + %buf4 = memref.cast %buf3 : memref to memref<*xi32> + call @printMemrefI32(%buf4) : (memref<*xi32>) -> () + return + } + func.func private @fillResource1DInt(%0 : memref, %1 : i32) + func.func private @printMemrefI32(%ptr : memref<*xi32>) +} From cfb209b92a26f16ed7413b32da20fc436eff8c58 Mon Sep 17 00:00:00 2001 From: Vy Nguyen Date: Wed, 29 May 2024 12:22:42 -0400 Subject: [PATCH 145/230] [lldb][lldb-dap] Cleanup breakpoint filters. (#87550) Details: - remove Swift breakpoint filter because this version of LLDB does not support Swift. - only return objc filters when working on macos. 
--- lldb/include/lldb/API/SBDebugger.h | 2 ++ lldb/include/lldb/Symbol/TypeSystem.h | 1 + lldb/source/API/SBDebugger.cpp | 4 +++ lldb/source/Symbol/TypeSystem.cpp | 11 ++++++++ lldb/tools/lldb-dap/DAP.cpp | 39 ++++++++++++++++++++------- lldb/tools/lldb-dap/DAP.h | 4 ++- lldb/tools/lldb-dap/lldb-dap.cpp | 6 +++-- 7 files changed, 54 insertions(+), 13 deletions(-) diff --git a/lldb/include/lldb/API/SBDebugger.h b/lldb/include/lldb/API/SBDebugger.h index af19b1faf3bf51..84ea9c0f772e16 100644 --- a/lldb/include/lldb/API/SBDebugger.h +++ b/lldb/include/lldb/API/SBDebugger.h @@ -57,6 +57,8 @@ class LLDB_API SBDebugger { static const char *GetBroadcasterClass(); + static bool SupportsLanguage(lldb::LanguageType language); + lldb::SBBroadcaster GetBroadcaster(); /// Get progress data from a SBEvent whose type is eBroadcastBitProgress. diff --git a/lldb/include/lldb/Symbol/TypeSystem.h b/lldb/include/lldb/Symbol/TypeSystem.h index b4025c173a1861..7d48f9b316138c 100644 --- a/lldb/include/lldb/Symbol/TypeSystem.h +++ b/lldb/include/lldb/Symbol/TypeSystem.h @@ -209,6 +209,7 @@ class TypeSystem : public PluginInterface, // TypeSystems can support more than one language virtual bool SupportsLanguage(lldb::LanguageType language) = 0; + static bool SupportsLanguageStatic(lldb::LanguageType language); // Type Completion virtual bool GetCompleteType(lldb::opaque_compiler_type_t type) = 0; diff --git a/lldb/source/API/SBDebugger.cpp b/lldb/source/API/SBDebugger.cpp index 7ef0d6efd4aaa5..29da7d33dd80b8 100644 --- a/lldb/source/API/SBDebugger.cpp +++ b/lldb/source/API/SBDebugger.cpp @@ -1742,3 +1742,7 @@ bool SBDebugger::InterruptRequested() { return m_opaque_sp->InterruptRequested(); return false; } + +bool SBDebugger::SupportsLanguage(lldb::LanguageType language) { + return TypeSystem::SupportsLanguageStatic(language); +} diff --git a/lldb/source/Symbol/TypeSystem.cpp b/lldb/source/Symbol/TypeSystem.cpp index 4956f10a0b0a73..5d56d9b1829dac 100644 --- 
a/lldb/source/Symbol/TypeSystem.cpp +++ b/lldb/source/Symbol/TypeSystem.cpp @@ -335,3 +335,14 @@ TypeSystemMap::GetTypeSystemForLanguage(lldb::LanguageType language, } return GetTypeSystemForLanguage(language); } + +bool TypeSystem::SupportsLanguageStatic(lldb::LanguageType language) { + if (language == eLanguageTypeUnknown) + return false; + + LanguageSet languages = + PluginManager::GetAllTypeSystemSupportedLanguagesForTypes(); + if (languages.Empty()) + return false; + return languages[language]; +} diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp index d419f821999e6c..807d27c2c869d9 100644 --- a/lldb/tools/lldb-dap/DAP.cpp +++ b/lldb/tools/lldb-dap/DAP.cpp @@ -32,14 +32,7 @@ namespace lldb_dap { DAP g_dap; DAP::DAP() - : broadcaster("lldb-dap"), - exception_breakpoints( - {{"cpp_catch", "C++ Catch", lldb::eLanguageTypeC_plus_plus}, - {"cpp_throw", "C++ Throw", lldb::eLanguageTypeC_plus_plus}, - {"objc_catch", "Objective-C Catch", lldb::eLanguageTypeObjC}, - {"objc_throw", "Objective-C Throw", lldb::eLanguageTypeObjC}, - {"swift_catch", "Swift Catch", lldb::eLanguageTypeSwift}, - {"swift_throw", "Swift Throw", lldb::eLanguageTypeSwift}}), + : broadcaster("lldb-dap"), exception_breakpoints(), focus_tid(LLDB_INVALID_THREAD_ID), stop_at_entry(false), is_attach(false), enable_auto_variable_summaries(false), enable_synthetic_child_debugging(false), @@ -65,8 +58,32 @@ DAP::DAP() DAP::~DAP() = default; +void DAP::PopulateExceptionBreakpoints() { + exception_breakpoints = {}; + if (debugger.SupportsLanguage(lldb::eLanguageTypeC_plus_plus)) { + exception_breakpoints->emplace_back("cpp_catch", "C++ Catch", + lldb::eLanguageTypeC_plus_plus); + exception_breakpoints->emplace_back("cpp_throw", "C++ Throw", + lldb::eLanguageTypeC_plus_plus); + } + if (debugger.SupportsLanguage(lldb::eLanguageTypeObjC)) { + exception_breakpoints->emplace_back("objc_catch", "Objective-C Catch", + lldb::eLanguageTypeObjC); + exception_breakpoints->emplace_back("objc_throw", 
"Objective-C Throw", + lldb::eLanguageTypeObjC); + } + if (debugger.SupportsLanguage(lldb::eLanguageTypeSwift)) { + exception_breakpoints->emplace_back("swift_catch", "Swift Catch", + lldb::eLanguageTypeSwift); + exception_breakpoints->emplace_back("swift_throw", "Swift Throw", + lldb::eLanguageTypeSwift); + } +} + ExceptionBreakpoint *DAP::GetExceptionBreakpoint(const std::string &filter) { - for (auto &bp : exception_breakpoints) { + assert(exception_breakpoints.has_value() && + "PopulateExceptionBreakpoints must be called first"); + for (auto &bp : *exception_breakpoints) { if (bp.filter == filter) return &bp; } @@ -74,7 +91,9 @@ ExceptionBreakpoint *DAP::GetExceptionBreakpoint(const std::string &filter) { } ExceptionBreakpoint *DAP::GetExceptionBreakpoint(const lldb::break_id_t bp_id) { - for (auto &bp : exception_breakpoints) { + assert(exception_breakpoints.has_value() && + "PopulateExceptionBreakpoints must be called first"); + for (auto &bp : *exception_breakpoints) { if (bp.bp.GetID() == bp_id) return &bp; } diff --git a/lldb/tools/lldb-dap/DAP.h b/lldb/tools/lldb-dap/DAP.h index a88ee3e1dec6bc..d114b886a15970 100644 --- a/lldb/tools/lldb-dap/DAP.h +++ b/lldb/tools/lldb-dap/DAP.h @@ -156,7 +156,7 @@ struct DAP { std::unique_ptr log; llvm::StringMap source_breakpoints; FunctionBreakpointMap function_breakpoints; - std::vector exception_breakpoints; + std::optional> exception_breakpoints; std::vector init_commands; std::vector pre_run_commands; std::vector post_run_commands; @@ -228,6 +228,8 @@ struct DAP { llvm::json::Value CreateTopLevelScopes(); + void PopulateExceptionBreakpoints(); + /// \return /// Attempt to determine if an expression is a variable expression or /// lldb command using a hueristic based on the first term of the diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp index 7746afb6cbbf38..470c9f84c6a203 100644 --- a/lldb/tools/lldb-dap/lldb-dap.cpp +++ b/lldb/tools/lldb-dap/lldb-dap.cpp @@ -16,6 +16,7 @@ 
#include #include #include +#include #include #include #if defined(_WIN32) @@ -1586,6 +1587,7 @@ void request_initialize(const llvm::json::Object &request) { bool source_init_file = GetBoolean(arguments, "sourceInitFile", true); g_dap.debugger = lldb::SBDebugger::Create(source_init_file, log_cb, nullptr); + g_dap.PopulateExceptionBreakpoints(); auto cmd = g_dap.debugger.GetCommandInterpreter().AddMultiwordCommand( "lldb-dap", "Commands for managing lldb-dap."); if (GetBoolean(arguments, "supportsStartDebuggingRequest", false)) { @@ -1621,7 +1623,7 @@ void request_initialize(const llvm::json::Object &request) { body.try_emplace("supportsEvaluateForHovers", true); // Available filters or options for the setExceptionBreakpoints request. llvm::json::Array filters; - for (const auto &exc_bp : g_dap.exception_breakpoints) { + for (const auto &exc_bp : *g_dap.exception_breakpoints) { filters.emplace_back(CreateExceptionBreakpointFilter(exc_bp)); } body.try_emplace("exceptionBreakpointFilters", std::move(filters)); @@ -2476,7 +2478,7 @@ void request_setExceptionBreakpoints(const llvm::json::Object &request) { // Keep a list of any exception breakpoint filter names that weren't set // so we can clear any exception breakpoints if needed. std::set unset_filters; - for (const auto &bp : g_dap.exception_breakpoints) + for (const auto &bp : *g_dap.exception_breakpoints) unset_filters.insert(bp.filter); for (const auto &value : *filters) { From 2ceec68e1630b40a37448c44fea63f9114848235 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 29 May 2024 09:24:16 -0700 Subject: [PATCH 146/230] [ValueTypes] Rename FlagVT to Glue in ValueTypes.td. NFC Nothing ever refers to it as FlagVT so we can just use the LLVMName "Glue". 
--- llvm/include/llvm/CodeGen/ValueTypes.td | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td index a6981b0ffa13c2..963b6a71de3801 100644 --- a/llvm/include/llvm/CodeGen/ValueTypes.td +++ b/llvm/include/llvm/CodeGen/ValueTypes.td @@ -274,9 +274,7 @@ def nxv4f64 : VTScalableVec<4, f64, 188>; // n x 4 x f64 vector value def nxv8f64 : VTScalableVec<8, f64, 189>; // n x 8 x f64 vector value def x86mmx : ValueType<64, 190>; // X86 MMX value -def FlagVT : ValueType<0, 191> { // Pre-RA sched glue - let LLVMName = "Glue"; -} +def Glue : ValueType<0, 191>; // Pre-RA sched glue def isVoid : ValueType<0, 192>; // Produces no value def untyped : ValueType<8, 193> { // Produces an untyped value let LLVMName = "Untyped"; From 949ef57dd20f8d3f3257376b91af71ab8c380338 Mon Sep 17 00:00:00 2001 From: Konstantin Zhuravlyov Date: Wed, 29 May 2024 12:52:34 -0400 Subject: [PATCH 147/230] AMDGPU/NFC: Reserve 0x058 EF_AMDGPU_MACHs (#93696) --- llvm/docs/AMDGPUUsage.rst | 1 + llvm/include/llvm/BinaryFormat/ELF.h | 1 + 2 files changed, 2 insertions(+) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index b827524e6b8db4..95b54548f4fa8a 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -1972,6 +1972,7 @@ The AMDGPU backend uses the following ELF header: *reserved* 0x055 Reserved. *reserved* 0x056 Reserved. *reserved* 0x057 Reserved. + *reserved* 0x058 Reserved. 
========================================== ========== ============================= Sections diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h index 67cacaed2e12e0..9a538252d9beff 100644 --- a/llvm/include/llvm/BinaryFormat/ELF.h +++ b/llvm/include/llvm/BinaryFormat/ELF.h @@ -798,6 +798,7 @@ enum : unsigned { EF_AMDGPU_MACH_AMDGCN_RESERVED_0X55 = 0x055, EF_AMDGPU_MACH_AMDGCN_RESERVED_0X56 = 0x056, EF_AMDGPU_MACH_AMDGCN_RESERVED_0X57 = 0x057, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X58 = 0x058, // clang-format on // First/last AMDGCN-based processors. From 4e251e7cad6c27b7476edd8e1dc4b98d5a8efe76 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 29 May 2024 17:57:23 +0100 Subject: [PATCH 148/230] Fix MSVC "result of 32-bit shift implicitly converted to 64 bits" warning. NFC. --- llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 9208b096affad9..6f0cae2edab17f 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -8013,7 +8013,7 @@ LegalizerHelper::lowerBitreverse(MachineInstr &MI) { Tmp2 = MIRBuilder.buildLShr(Ty, Src, ShAmt); } - auto Mask = MIRBuilder.buildConstant(Ty, 1U << J); + auto Mask = MIRBuilder.buildConstant(Ty, 1ULL << J); Tmp2 = MIRBuilder.buildAnd(Ty, Tmp2, Mask); if (I == 0) Tmp = Tmp2; From 2665b2a6ddb1625799536c45ca15605a6f24c081 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 29 May 2024 18:05:41 +0100 Subject: [PATCH 149/230] [X86] Pull out combineConstantPoolLoads helper from combineLoad. NFC. The logic is already pretty dense and a future patch will further complicate this. 
--- llvm/lib/Target/X86/X86ISelLowering.cpp | 137 +++++++++++++++--------- 1 file changed, 86 insertions(+), 51 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 2d8343ffa1a0b3..24340e135b08b9 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -50823,10 +50823,83 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, return SDValue(); } +static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl, + SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + auto *Ld = cast(N); + EVT RegVT = Ld->getValueType(0); + EVT MemVT = Ld->getMemoryVT(); + SDValue Ptr = Ld->getBasePtr(); + SDValue Chain = Ld->getChain(); + ISD::LoadExtType Ext = Ld->getExtensionType(); + + if (Ext != ISD::NON_EXTLOAD || !Subtarget.hasAVX() || !Ld->isSimple()) + return SDValue(); + + if (!(RegVT.is128BitVector() || RegVT.is256BitVector())) + return SDValue(); + + auto MatchingBits = [](const APInt &Undefs, const APInt &UserUndefs, + ArrayRef Bits, ArrayRef UserBits) { + for (unsigned I = 0, E = Undefs.getBitWidth(); I != E; ++I) { + if (Undefs[I]) + continue; + if (UserUndefs[I] || Bits[I] != UserBits[I]) + return false; + } + return true; + }; + + // Look through all other loads/broadcasts in the chain for another constant + // pool entry. 
+ for (SDNode *User : Chain->uses()) { + auto *UserLd = dyn_cast(User); + if (User != N && UserLd && + (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD || + User->getOpcode() == X86ISD::VBROADCAST_LOAD || + ISD::isNormalLoad(User)) && + UserLd->getChain() == Chain && !User->hasAnyUseOfValue(1) && + User->getValueSizeInBits(0).getFixedValue() > + RegVT.getFixedSizeInBits()) { + EVT UserVT = User->getValueType(0); + SDValue UserPtr = UserLd->getBasePtr(); + const Constant *LdC = getTargetConstantFromBasePtr(Ptr); + const Constant *UserC = getTargetConstantFromBasePtr(UserPtr); + + // See if we are loading a constant that matches in the lower + // bits of a longer constant (but from a different constant pool ptr). + if (LdC && UserC && UserPtr != Ptr) { + unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits(); + unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits(); + if (LdSize < UserSize || !ISD::isNormalLoad(User)) { + APInt Undefs, UserUndefs; + SmallVector Bits, UserBits; + unsigned NumBits = std::min(RegVT.getScalarSizeInBits(), + UserVT.getScalarSizeInBits()); + if (getTargetConstantBitsFromNode(SDValue(N, 0), NumBits, Undefs, + Bits) && + getTargetConstantBitsFromNode(SDValue(User, 0), NumBits, + UserUndefs, UserBits)) { + if (MatchingBits(Undefs, UserUndefs, Bits, UserBits)) { + SDValue Extract = extractSubVector( + SDValue(User, 0), 0, DAG, SDLoc(N), RegVT.getSizeInBits()); + Extract = DAG.getBitcast(RegVT, Extract); + return DCI.CombineTo(N, Extract, SDValue(User, 1)); + } + } + } + } + } + } + + return SDValue(); +} + static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { - LoadSDNode *Ld = cast(N); + auto *Ld = cast(N); EVT RegVT = Ld->getValueType(0); EVT MemVT = Ld->getMemoryVT(); SDLoc dl(Ld); @@ -50885,7 +50958,7 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, } } - // If we also load/broadcast this to a wider type, then just extract the + // If we also 
broadcast this vector to a wider type, then just extract the // lowest subvector. if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() && (RegVT.is128BitVector() || RegVT.is256BitVector())) { @@ -50894,61 +50967,23 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, for (SDNode *User : Chain->uses()) { auto *UserLd = dyn_cast(User); if (User != N && UserLd && - (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD || - User->getOpcode() == X86ISD::VBROADCAST_LOAD || - ISD::isNormalLoad(User)) && - UserLd->getChain() == Chain && !User->hasAnyUseOfValue(1) && + User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD && + UserLd->getChain() == Chain && UserLd->getBasePtr() == Ptr && + UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() && + !User->hasAnyUseOfValue(1) && User->getValueSizeInBits(0).getFixedValue() > RegVT.getFixedSizeInBits()) { - if (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD && - UserLd->getBasePtr() == Ptr && - UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits()) { - SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N), - RegVT.getSizeInBits()); - Extract = DAG.getBitcast(RegVT, Extract); - return DCI.CombineTo(N, Extract, SDValue(User, 1)); - } - auto MatchingBits = [](const APInt &Undefs, const APInt &UserUndefs, - ArrayRef Bits, ArrayRef UserBits) { - for (unsigned I = 0, E = Undefs.getBitWidth(); I != E; ++I) { - if (Undefs[I]) - continue; - if (UserUndefs[I] || Bits[I] != UserBits[I]) - return false; - } - return true; - }; - // See if we are loading a constant that matches in the lower - // bits of a longer constant (but from a different constant pool ptr). 
- EVT UserVT = User->getValueType(0); - SDValue UserPtr = UserLd->getBasePtr(); - const Constant *LdC = getTargetConstantFromBasePtr(Ptr); - const Constant *UserC = getTargetConstantFromBasePtr(UserPtr); - if (LdC && UserC && UserPtr != Ptr) { - unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits(); - unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits(); - if (LdSize < UserSize || !ISD::isNormalLoad(User)) { - APInt Undefs, UserUndefs; - SmallVector Bits, UserBits; - unsigned NumBits = std::min(RegVT.getScalarSizeInBits(), - UserVT.getScalarSizeInBits()); - if (getTargetConstantBitsFromNode(SDValue(N, 0), NumBits, Undefs, - Bits) && - getTargetConstantBitsFromNode(SDValue(User, 0), NumBits, - UserUndefs, UserBits)) { - if (MatchingBits(Undefs, UserUndefs, Bits, UserBits)) { - SDValue Extract = extractSubVector( - SDValue(User, 0), 0, DAG, SDLoc(N), RegVT.getSizeInBits()); - Extract = DAG.getBitcast(RegVT, Extract); - return DCI.CombineTo(N, Extract, SDValue(User, 1)); - } - } - } - } + SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl, + RegVT.getSizeInBits()); + Extract = DAG.getBitcast(RegVT, Extract); + return DCI.CombineTo(N, Extract, SDValue(User, 1)); } } } + if (SDValue V = combineConstantPoolLoads(Ld, dl, DAG, DCI, Subtarget)) + return V; + // Cast ptr32 and ptr64 pointers to the default address space before a load. unsigned AddrSpace = Ld->getAddressSpace(); if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR || From 737a3018e826f5452f181a550be90b9135d8eda5 Mon Sep 17 00:00:00 2001 From: Mingming Liu Date: Wed, 29 May 2024 10:15:17 -0700 Subject: [PATCH 150/230] [nfc][InstrFDO] Add Header::getIndexedProfileVersion and use it to decide profile version. (#93613) This is a split of https://github.com/llvm/llvm-project/pull/93346 as discussed. 
--- llvm/include/llvm/ProfileData/InstrProf.h | 4 ++++ llvm/lib/ProfileData/InstrProf.cpp | 11 ++++++++--- llvm/lib/ProfileData/InstrProfReader.cpp | 8 ++++---- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h index 2cee928b210e2e..15b9eb688e27e5 100644 --- a/llvm/include/llvm/ProfileData/InstrProf.h +++ b/llvm/include/llvm/ProfileData/InstrProf.h @@ -1211,6 +1211,10 @@ struct Header { // Returns the size of the header in bytes for all valid fields based on the // version. I.e a older version header will return a smaller size. size_t size() const; + + // Return the indexed profile version, i.e., the least significant 32 bits + // in Header.Version. + uint64_t getIndexedProfileVersion() const; }; // Profile summary data recorded in the profile data file in indexed diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index f9cd71b37002fe..dcf6aac8b59968 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -1656,10 +1656,11 @@ Expected
Header::readFromBuffer(const unsigned char *Buffer) { // Read the version. H.Version = read(Buffer, offsetOf(&Header::Version)); - if (GET_VERSION(H.Version) > IndexedInstrProf::ProfVersion::CurrentVersion) + if (H.getIndexedProfileVersion() > + IndexedInstrProf::ProfVersion::CurrentVersion) return make_error(instrprof_error::unsupported_version); - switch (GET_VERSION(H.Version)) { + switch (H.getIndexedProfileVersion()) { // When a new field is added in the header add a case statement here to // populate it. static_assert( @@ -1689,8 +1690,12 @@ Expected
Header::readFromBuffer(const unsigned char *Buffer) { return H; } +uint64_t Header::getIndexedProfileVersion() const { + return GET_VERSION(Version); +} + size_t Header::size() const { - switch (GET_VERSION(Version)) { + switch (getIndexedProfileVersion()) { // When a new field is added to the header add a case statement here to // compute the size as offset of the new field + size of the new field. This // relies on the field being added to the end of the list. diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index 798236c295194a..a5ae0c6fa62444 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -1328,7 +1328,7 @@ Error IndexedInstrProfReader::readHeader() { // The MemProfOffset field in the header is only valid when the format // version is higher than 8 (when it was introduced). - if (GET_VERSION(Header->Version) >= 8 && + if (Header->getIndexedProfileVersion() >= 8 && Header->Version & VARIANT_MASK_MEMPROF) { if (Error E = MemProfReader.deserialize(Start, Header->MemProfOffset)) return E; @@ -1336,7 +1336,7 @@ Error IndexedInstrProfReader::readHeader() { // BinaryIdOffset field in the header is only valid when the format version // is higher than 9 (when it was introduced). - if (GET_VERSION(Header->Version) >= 9) { + if (Header->getIndexedProfileVersion() >= 9) { const unsigned char *Ptr = Start + Header->BinaryIdOffset; // Read binary ids size. 
BinaryIdsSize = @@ -1350,7 +1350,7 @@ Error IndexedInstrProfReader::readHeader() { "corrupted binary ids"); } - if (GET_VERSION(Header->Version) >= 12) { + if (Header->getIndexedProfileVersion() >= 12) { const unsigned char *Ptr = Start + Header->VTableNamesOffset; CompressedVTableNamesLen = @@ -1363,7 +1363,7 @@ Error IndexedInstrProfReader::readHeader() { return make_error(instrprof_error::truncated); } - if (GET_VERSION(Header->Version) >= 10 && + if (Header->getIndexedProfileVersion() >= 10 && Header->Version & VARIANT_MASK_TEMPORAL_PROF) { const unsigned char *Ptr = Start + Header->TemporalProfTracesOffset; const auto *PtrEnd = (const unsigned char *)DataBuffer->getBufferEnd(); From 8c5a7a1fc4890fcae50f8e8a61d5a2e2b1ebd7e5 Mon Sep 17 00:00:00 2001 From: Vadim D <36827317+vvd170501@users.noreply.github.com> Date: Wed, 29 May 2024 20:29:57 +0300 Subject: [PATCH 151/230] [clangd] Add config option to allow detection of unused angled includes (#87208) This PR adds a new `AnalyzeAngledIncludes` option to `Includes` section of clangd config. This option enables unused include checks for all includes that use the `<>` syntax, not just standard library includes. 
--- clang-tools-extra/clangd/Config.h | 5 +- clang-tools-extra/clangd/ConfigCompile.cpp | 60 ++++++++++++------- clang-tools-extra/clangd/ConfigFragment.h | 4 ++ clang-tools-extra/clangd/ConfigYAML.cpp | 4 ++ clang-tools-extra/clangd/IncludeCleaner.cpp | 32 ++++++---- clang-tools-extra/clangd/IncludeCleaner.h | 4 +- clang-tools-extra/clangd/ParsedAST.cpp | 3 +- .../clangd/unittests/ConfigCompileTests.cpp | 6 ++ .../clangd/unittests/ConfigYAMLTests.cpp | 15 +++++ .../clangd/unittests/IncludeCleanerTests.cpp | 44 ++++++++++++++ clang-tools-extra/docs/ReleaseNotes.rst | 5 ++ 11 files changed, 144 insertions(+), 38 deletions(-) diff --git a/clang-tools-extra/clangd/Config.h b/clang-tools-extra/clangd/Config.h index 4371c80a6c5877..41143b9ebc8d27 100644 --- a/clang-tools-extra/clangd/Config.h +++ b/clang-tools-extra/clangd/Config.h @@ -110,10 +110,11 @@ struct Config { IncludesPolicy UnusedIncludes = IncludesPolicy::Strict; IncludesPolicy MissingIncludes = IncludesPolicy::None; - /// IncludeCleaner will not diagnose usages of these headers matched by - /// these regexes. struct { + /// IncludeCleaner will not diagnose usages of these headers matched by + /// these regexes. std::vector> IgnoreHeader; + bool AnalyzeAngledIncludes = false; } Includes; } Diagnostics; diff --git a/clang-tools-extra/clangd/ConfigCompile.cpp b/clang-tools-extra/clangd/ConfigCompile.cpp index 5bb2eb4a9f803f..f32f674443ffeb 100644 --- a/clang-tools-extra/clangd/ConfigCompile.cpp +++ b/clang-tools-extra/clangd/ConfigCompile.cpp @@ -572,32 +572,46 @@ struct FragmentCompiler { #else static llvm::Regex::RegexFlags Flags = llvm::Regex::NoFlags; #endif - auto Filters = std::make_shared>(); - for (auto &HeaderPattern : F.IgnoreHeader) { - // Anchor on the right. 
- std::string AnchoredPattern = "(" + *HeaderPattern + ")$"; - llvm::Regex CompiledRegex(AnchoredPattern, Flags); - std::string RegexError; - if (!CompiledRegex.isValid(RegexError)) { - diag(Warning, - llvm::formatv("Invalid regular expression '{0}': {1}", - *HeaderPattern, RegexError) - .str(), - HeaderPattern.Range); - continue; + std::shared_ptr> Filters; + if (!F.IgnoreHeader.empty()) { + Filters = std::make_shared>(); + for (auto &HeaderPattern : F.IgnoreHeader) { + // Anchor on the right. + std::string AnchoredPattern = "(" + *HeaderPattern + ")$"; + llvm::Regex CompiledRegex(AnchoredPattern, Flags); + std::string RegexError; + if (!CompiledRegex.isValid(RegexError)) { + diag(Warning, + llvm::formatv("Invalid regular expression '{0}': {1}", + *HeaderPattern, RegexError) + .str(), + HeaderPattern.Range); + continue; + } + Filters->push_back(std::move(CompiledRegex)); } - Filters->push_back(std::move(CompiledRegex)); } - if (Filters->empty()) + // Optional to override the resulting AnalyzeAngledIncludes + // only if it's explicitly set in the current fragment. + // Otherwise it's inherited from parent fragment. 
+ std::optional AnalyzeAngledIncludes; + if (F.AnalyzeAngledIncludes.has_value()) + AnalyzeAngledIncludes = **F.AnalyzeAngledIncludes; + if (!Filters && !AnalyzeAngledIncludes.has_value()) return; - auto Filter = [Filters](llvm::StringRef Path) { - for (auto &Regex : *Filters) - if (Regex.match(Path)) - return true; - return false; - }; - Out.Apply.push_back([Filter](const Params &, Config &C) { - C.Diagnostics.Includes.IgnoreHeader.emplace_back(Filter); + Out.Apply.push_back([Filters = std::move(Filters), + AnalyzeAngledIncludes](const Params &, Config &C) { + if (Filters) { + auto Filter = [Filters](llvm::StringRef Path) { + for (auto &Regex : *Filters) + if (Regex.match(Path)) + return true; + return false; + }; + C.Diagnostics.Includes.IgnoreHeader.emplace_back(std::move(Filter)); + } + if (AnalyzeAngledIncludes.has_value()) + C.Diagnostics.Includes.AnalyzeAngledIncludes = *AnalyzeAngledIncludes; }); } diff --git a/clang-tools-extra/clangd/ConfigFragment.h b/clang-tools-extra/clangd/ConfigFragment.h index 7fa61108c78a05..f3e51a9b6dbc4b 100644 --- a/clang-tools-extra/clangd/ConfigFragment.h +++ b/clang-tools-extra/clangd/ConfigFragment.h @@ -254,6 +254,10 @@ struct Fragment { /// unused or missing. These can match any suffix of the header file in /// question. std::vector> IgnoreHeader; + + /// If false (default), unused system headers will be ignored. + /// Standard library headers are analyzed regardless of this option. 
+ std::optional> AnalyzeAngledIncludes; }; IncludesBlock Includes; diff --git a/clang-tools-extra/clangd/ConfigYAML.cpp b/clang-tools-extra/clangd/ConfigYAML.cpp index ce09af819247ae..3e9b6a07d3b325 100644 --- a/clang-tools-extra/clangd/ConfigYAML.cpp +++ b/clang-tools-extra/clangd/ConfigYAML.cpp @@ -169,6 +169,10 @@ class Parser { if (auto Values = scalarValues(N)) F.IgnoreHeader = std::move(*Values); }); + Dict.handle("AnalyzeAngledIncludes", [&](Node &N) { + if (auto Value = boolValue(N, "AnalyzeAngledIncludes")) + F.AnalyzeAngledIncludes = *Value; + }); Dict.parse(N); } diff --git a/clang-tools-extra/clangd/IncludeCleaner.cpp b/clang-tools-extra/clangd/IncludeCleaner.cpp index 8e48f546d94e77..01b47679790f1d 100644 --- a/clang-tools-extra/clangd/IncludeCleaner.cpp +++ b/clang-tools-extra/clangd/IncludeCleaner.cpp @@ -68,24 +68,30 @@ bool isIgnored(llvm::StringRef HeaderPath, HeaderFilter IgnoreHeaders) { } bool mayConsiderUnused(const Inclusion &Inc, ParsedAST &AST, - const include_cleaner::PragmaIncludes *PI) { + const include_cleaner::PragmaIncludes *PI, + bool AnalyzeAngledIncludes) { assert(Inc.HeaderID); auto HID = static_cast(*Inc.HeaderID); auto FE = AST.getSourceManager().getFileManager().getFileRef( AST.getIncludeStructure().getRealPath(HID)); assert(FE); if (FE->getDir() == AST.getPreprocessor() - .getHeaderSearchInfo() - .getModuleMap() - .getBuiltinDir()) + .getHeaderSearchInfo() + .getModuleMap() + .getBuiltinDir()) return false; if (PI && PI->shouldKeep(*FE)) return false; // FIXME(kirillbobyrev): We currently do not support the umbrella headers. // System headers are likely to be standard library headers. - // Until we have good support for umbrella headers, don't warn about them. - if (Inc.Written.front() == '<') - return tooling::stdlib::Header::named(Inc.Written).has_value(); + // Until we have good support for umbrella headers, don't warn about them + // (unless analysis is explicitly enabled). 
+ if (Inc.Written.front() == '<') { + if (tooling::stdlib::Header::named(Inc.Written)) + return true; + if (!AnalyzeAngledIncludes) + return false; + } if (PI) { // Check if main file is the public interface for a private header. If so we // shouldn't diagnose it as unused. @@ -266,7 +272,8 @@ Fix fixAll(const Fix &RemoveAllUnused, const Fix &AddAllMissing) { std::vector getUnused(ParsedAST &AST, - const llvm::DenseSet &ReferencedFiles) { + const llvm::DenseSet &ReferencedFiles, + bool AnalyzeAngledIncludes) { trace::Span Tracer("IncludeCleaner::getUnused"); std::vector Unused; for (const Inclusion &MFI : AST.getIncludeStructure().MainFileIncludes) { @@ -275,7 +282,8 @@ getUnused(ParsedAST &AST, auto IncludeID = static_cast(*MFI.HeaderID); if (ReferencedFiles.contains(IncludeID)) continue; - if (!mayConsiderUnused(MFI, AST, &AST.getPragmaIncludes())) { + if (!mayConsiderUnused(MFI, AST, &AST.getPragmaIncludes(), + AnalyzeAngledIncludes)) { dlog("{0} was not used, but is not eligible to be diagnosed as unused", MFI.Written); continue; @@ -347,7 +355,8 @@ include_cleaner::Includes convertIncludes(const ParsedAST &AST) { return ConvertedIncludes; } -IncludeCleanerFindings computeIncludeCleanerFindings(ParsedAST &AST) { +IncludeCleanerFindings +computeIncludeCleanerFindings(ParsedAST &AST, bool AnalyzeAngledIncludes) { // Interaction is only polished for C/CPP. 
if (AST.getLangOpts().ObjC) return {}; @@ -432,7 +441,8 @@ IncludeCleanerFindings computeIncludeCleanerFindings(ParsedAST &AST) { MapInfo::getHashValue(RHS.Symbol); }); MissingIncludes.erase(llvm::unique(MissingIncludes), MissingIncludes.end()); - std::vector UnusedIncludes = getUnused(AST, Used); + std::vector UnusedIncludes = + getUnused(AST, Used, AnalyzeAngledIncludes); return {std::move(UnusedIncludes), std::move(MissingIncludes)}; } diff --git a/clang-tools-extra/clangd/IncludeCleaner.h b/clang-tools-extra/clangd/IncludeCleaner.h index 624e2116be7da3..a01146d14e3c17 100644 --- a/clang-tools-extra/clangd/IncludeCleaner.h +++ b/clang-tools-extra/clangd/IncludeCleaner.h @@ -53,7 +53,9 @@ struct IncludeCleanerFindings { std::vector MissingIncludes; }; -IncludeCleanerFindings computeIncludeCleanerFindings(ParsedAST &AST); +IncludeCleanerFindings +computeIncludeCleanerFindings(ParsedAST &AST, + bool AnalyzeAngledIncludes = false); using HeaderFilter = llvm::ArrayRef>; std::vector diff --git a/clang-tools-extra/clangd/ParsedAST.cpp b/clang-tools-extra/clangd/ParsedAST.cpp index 3ff759415f7c8b..2bd1fbcad2ada0 100644 --- a/clang-tools-extra/clangd/ParsedAST.cpp +++ b/clang-tools-extra/clangd/ParsedAST.cpp @@ -373,7 +373,8 @@ std::vector getIncludeCleanerDiags(ParsedAST &AST, llvm::StringRef Code, Cfg.Diagnostics.UnusedIncludes == Config::IncludesPolicy::None; if (SuppressMissing && SuppressUnused) return {}; - auto Findings = computeIncludeCleanerFindings(AST); + auto Findings = computeIncludeCleanerFindings( + AST, Cfg.Diagnostics.Includes.AnalyzeAngledIncludes); if (SuppressMissing) Findings.MissingIncludes.clear(); if (SuppressUnused) diff --git a/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp b/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp index f0ffc429c0ca90..4ecfdf0184ab40 100644 --- a/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp +++ b/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp @@ -277,6 +277,12 @@ 
TEST_F(ConfigCompileTests, DiagnosticsIncludeCleaner) { }; EXPECT_TRUE(HeaderFilter("foo.h")); EXPECT_FALSE(HeaderFilter("bar.h")); + + Frag = {}; + EXPECT_FALSE(Conf.Diagnostics.Includes.AnalyzeAngledIncludes); + Frag.Diagnostics.Includes.AnalyzeAngledIncludes = true; + EXPECT_TRUE(compileAndApply()); + EXPECT_TRUE(Conf.Diagnostics.Includes.AnalyzeAngledIncludes); } TEST_F(ConfigCompileTests, DiagnosticSuppression) { diff --git a/clang-tools-extra/clangd/unittests/ConfigYAMLTests.cpp b/clang-tools-extra/clangd/unittests/ConfigYAMLTests.cpp index 44a6647d4c0a81..10d67dead342c3 100644 --- a/clang-tools-extra/clangd/unittests/ConfigYAMLTests.cpp +++ b/clang-tools-extra/clangd/unittests/ConfigYAMLTests.cpp @@ -278,6 +278,21 @@ TEST(ParseYAML, IncludesIgnoreHeader) { ElementsAre(val("foo"), val("bar"))); } +TEST(ParseYAML, IncludesAnalyzeAngledIncludes) { + CapturedDiags Diags; + Annotations YAML(R"yaml( +Diagnostics: + Includes: + AnalyzeAngledIncludes: true + )yaml"); + auto Results = + Fragment::parseYAML(YAML.code(), "config.yaml", Diags.callback()); + ASSERT_THAT(Diags.Diagnostics, IsEmpty()); + ASSERT_EQ(Results.size(), 1u); + EXPECT_THAT(Results[0].Diagnostics.Includes.AnalyzeAngledIncludes, + llvm::ValueIs(val(true))); +} + TEST(ParseYAML, Style) { CapturedDiags Diags; Annotations YAML(R"yaml( diff --git a/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp b/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp index 142310837bd9ce..7027232460354c 100644 --- a/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp +++ b/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp @@ -108,6 +108,7 @@ TEST(IncludeCleaner, GetUnusedHeaders) { #include "unguarded.h" #include "unused.h" #include + #include void foo() { a(); b(); @@ -122,6 +123,7 @@ TEST(IncludeCleaner, GetUnusedHeaders) { TU.AdditionalFiles["dir/c.h"] = guard("void c();"); TU.AdditionalFiles["unused.h"] = guard("void unused();"); TU.AdditionalFiles["dir/unused.h"] = guard("void 
dirUnused();"); + TU.AdditionalFiles["dir/non_system_angled_header.h"] = guard(""); TU.AdditionalFiles["system/system_header.h"] = guard(""); TU.AdditionalFiles["unguarded.h"] = ""; TU.ExtraArgs.push_back("-I" + testPath("dir")); @@ -135,6 +137,48 @@ TEST(IncludeCleaner, GetUnusedHeaders) { Pointee(writtenInclusion("\"dir/unused.h\"")))); } +TEST(IncludeCleaner, IgnoredAngledHeaders) { + // Currently the default behavior is to ignore unused angled includes + auto TU = TestTU::withCode(R"cpp( + #include + #include + #include + SystemClass x; + )cpp"); + TU.AdditionalFiles["system/system_header.h"] = guard("class SystemClass {};"); + TU.AdditionalFiles["system/system_unused.h"] = guard(""); + TU.AdditionalFiles["dir/non_system_angled_unused.h"] = guard(""); + TU.ExtraArgs = { + "-isystem" + testPath("system"), + "-I" + testPath("dir"), + }; + auto AST = TU.build(); + IncludeCleanerFindings Findings = computeIncludeCleanerFindings(AST); + EXPECT_THAT(Findings.UnusedIncludes, IsEmpty()); +} + +TEST(IncludeCleaner, UnusedAngledHeaders) { + auto TU = TestTU::withCode(R"cpp( + #include + #include + #include + SystemClass x; + )cpp"); + TU.AdditionalFiles["system/system_header.h"] = guard("class SystemClass {};"); + TU.AdditionalFiles["system/system_unused.h"] = guard(""); + TU.AdditionalFiles["dir/non_system_angled_unused.h"] = guard(""); + TU.ExtraArgs = { + "-isystem" + testPath("system"), + "-I" + testPath("dir"), + }; + auto AST = TU.build(); + IncludeCleanerFindings Findings = computeIncludeCleanerFindings(AST, true); + EXPECT_THAT(Findings.UnusedIncludes, + UnorderedElementsAre( + Pointee(writtenInclusion("")), + Pointee(writtenInclusion("")))); +} + TEST(IncludeCleaner, ComputeMissingHeaders) { Annotations MainFile(R"cpp( #include "a.h" diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 3e3195f6f68139..a5e87d26d96c38 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst 
@@ -84,6 +84,11 @@ Objective-C Miscellaneous ^^^^^^^^^^^^^ +- Added a boolean option `AnalyzeAngledIncludes` to `Includes` config section, + which enables unused-include detection for all angled ("system") headers. + At this moment umbrella headers are not supported, so enabling this option + may result in false positives. + Improvements to clang-doc ------------------------- From 265589785ccf043492e4e0ab88c2830eae7d3496 Mon Sep 17 00:00:00 2001 From: Miro Bucko Date: Thu, 30 May 2024 00:37:57 +0700 Subject: [PATCH 152/230] [nfc][lldb] Move FastSearch from CommandObjectMemoryFind to Process (#93688) Moving CommandObjectMemoryFind::FastSearch() to Process::FindInMemory(). Plan to expose FindInMemory as public API in SBProcess. --- lldb/include/lldb/Target/Process.h | 22 +++++++ lldb/source/Commands/CommandObjectMemory.cpp | 61 +------------------- lldb/source/Target/Process.cpp | 54 +++++++++++++++++ 3 files changed, 78 insertions(+), 59 deletions(-) diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h index 637d34c29715c1..eec337c15f7edd 100644 --- a/lldb/include/lldb/Target/Process.h +++ b/lldb/include/lldb/Target/Process.h @@ -2663,6 +2663,28 @@ void PruneThreadPlans(); return m_source_file_cache; } + /// Find a pattern within a memory region. + /// + /// This function searches for a pattern represented by the provided buffer + /// within the memory range specified by the low and high addresses. It uses + /// a bad character heuristic to optimize the search process. + /// + /// \param[in] low The starting address of the memory region to be searched. + /// (inclusive) + /// + /// \param[in] high The ending address of the memory region to be searched. + /// (exclusive) + /// + /// \param[in] buf A pointer to the buffer containing the pattern to be + /// searched. + /// + /// \param[in] size The size of the buffer in bytes.
+ /// + /// \return The address where the pattern was found or LLDB_INVALID_ADDRESS if + /// not found. + lldb::addr_t FindInMemory(lldb::addr_t low, lldb::addr_t high, + const uint8_t *buf, size_t size); + protected: friend class Trace; diff --git a/lldb/source/Commands/CommandObjectMemory.cpp b/lldb/source/Commands/CommandObjectMemory.cpp index b78a0492cca558..1c13484dede648 100644 --- a/lldb/source/Commands/CommandObjectMemory.cpp +++ b/lldb/source/Commands/CommandObjectMemory.cpp @@ -977,35 +977,6 @@ class CommandObjectMemoryFind : public CommandObjectParsed { Options *GetOptions() override { return &m_option_group; } protected: - class ProcessMemoryIterator { - public: - ProcessMemoryIterator(ProcessSP process_sp, lldb::addr_t base) - : m_process_sp(process_sp), m_base_addr(base) { - lldbassert(process_sp.get() != nullptr); - } - - bool IsValid() { return m_is_valid; } - - uint8_t operator[](lldb::addr_t offset) { - if (!IsValid()) - return 0; - - uint8_t retval = 0; - Status error; - if (0 == - m_process_sp->ReadMemory(m_base_addr + offset, &retval, 1, error)) { - m_is_valid = false; - return 0; - } - - return retval; - } - - private: - ProcessSP m_process_sp; - lldb::addr_t m_base_addr; - bool m_is_valid = true; - }; void DoExecute(Args &command, CommandReturnObject &result) override { // No need to check "process" for validity as eCommandRequiresProcess // ensures it is valid @@ -1106,8 +1077,8 @@ class CommandObjectMemoryFind : public CommandObjectParsed { found_location = low_addr; bool ever_found = false; while (count) { - found_location = FastSearch(found_location, high_addr, buffer.GetBytes(), - buffer.GetByteSize()); + found_location = process->FindInMemory( + found_location, high_addr, buffer.GetBytes(), buffer.GetByteSize()); if (found_location == LLDB_INVALID_ADDRESS) { if (!ever_found) { result.AppendMessage("data not found within the range.\n"); @@ -1144,34 +1115,6 @@ class CommandObjectMemoryFind : public CommandObjectParsed { 
result.SetStatus(lldb::eReturnStatusSuccessFinishResult); } - lldb::addr_t FastSearch(lldb::addr_t low, lldb::addr_t high, uint8_t *buffer, - size_t buffer_size) { - const size_t region_size = high - low; - - if (region_size < buffer_size) - return LLDB_INVALID_ADDRESS; - - std::vector bad_char_heuristic(256, buffer_size); - ProcessSP process_sp = m_exe_ctx.GetProcessSP(); - ProcessMemoryIterator iterator(process_sp, low); - - for (size_t idx = 0; idx < buffer_size - 1; idx++) { - decltype(bad_char_heuristic)::size_type bcu_idx = buffer[idx]; - bad_char_heuristic[bcu_idx] = buffer_size - idx - 1; - } - for (size_t s = 0; s <= (region_size - buffer_size);) { - int64_t j = buffer_size - 1; - while (j >= 0 && buffer[j] == iterator[s + j]) - j--; - if (j < 0) - return low + s; - else - s += bad_char_heuristic[iterator[s + buffer_size - 1]]; - } - - return LLDB_INVALID_ADDRESS; - } - OptionGroupOptions m_option_group; OptionGroupFindMemory m_memory_options; OptionGroupMemoryTag m_memory_tag_options; diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index 216d2f21abfef0..1e321f8bde3919 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -112,6 +112,33 @@ class ProcessOptionValueProperties } }; +class ProcessMemoryIterator { +public: + ProcessMemoryIterator(Process &process, lldb::addr_t base) + : m_process(process), m_base_addr(base) {} + + bool IsValid() { return m_is_valid; } + + uint8_t operator[](lldb::addr_t offset) { + if (!IsValid()) + return 0; + + uint8_t retval = 0; + Status error; + if (0 == m_process.ReadMemory(m_base_addr + offset, &retval, 1, error)) { + m_is_valid = false; + return 0; + } + + return retval; + } + +private: + Process &m_process; + const lldb::addr_t m_base_addr; + bool m_is_valid = true; +}; + static constexpr OptionEnumValueElement g_follow_fork_mode_values[] = { { eFollowParent, @@ -3191,6 +3218,33 @@ Status Process::Halt(bool clear_thread_plans, bool use_run_lock) { return 
Status(); } +lldb::addr_t Process::FindInMemory(lldb::addr_t low, lldb::addr_t high, + const uint8_t *buf, size_t size) { + const size_t region_size = high - low; + + if (region_size < size) + return LLDB_INVALID_ADDRESS; + + std::vector bad_char_heuristic(256, size); + ProcessMemoryIterator iterator(*this, low); + + for (size_t idx = 0; idx < size - 1; idx++) { + decltype(bad_char_heuristic)::size_type bcu_idx = buf[idx]; + bad_char_heuristic[bcu_idx] = size - idx - 1; + } + for (size_t s = 0; s <= (region_size - size);) { + int64_t j = size - 1; + while (j >= 0 && buf[j] == iterator[s + j]) + j--; + if (j < 0) + return low + s; + else + s += bad_char_heuristic[iterator[s + size - 1]]; + } + + return LLDB_INVALID_ADDRESS; +} + Status Process::StopForDestroyOrDetach(lldb::EventSP &exit_event_sp) { Status error; From 9595eb10ae9a5661a596dff19bf39365140548e3 Mon Sep 17 00:00:00 2001 From: Hui Date: Wed, 29 May 2024 18:46:39 +0100 Subject: [PATCH 153/230] [libc++][test] Close LWG3018 and add tests (#93047) --- libcxx/docs/Status/Cxx20Issues.csv | 2 +- .../pointer_deleter.pass.cpp | 22 +++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv index 54517ab002b86b..e748ff6ad749b7 100644 --- a/libcxx/docs/Status/Cxx20Issues.csv +++ b/libcxx/docs/Status/Cxx20Issues.csv @@ -191,7 +191,7 @@ "","","","","","" "`1203 `__","More useful rvalue stream insertion","Prague","|Complete|","12.0" "`2859 `__","Definition of *reachable* in [ptr.launder] misses pointer arithmetic from pointer-interconvertible object","Prague","","" -"`3018 `__","``shared_ptr``\ of function type","Prague","","" +"`3018 `__","``shared_ptr``\ of function type","Prague","|Nothing To Do|","" "`3050 `__","Conversion specification problem in ``chrono::duration``\ constructor","Prague","|Complete|","19.0","|chrono|" "`3141 `__","``CopyConstructible``\ doesn't preserve source values","Prague","|Nothing to do|","" 
"`3150 `__","``UniformRandomBitGenerator``\ should validate ``min``\ and ``max``\ ","Prague","|Complete|","13.0","|ranges|" diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter.pass.cpp index 9c1e9b72be573c..562acf56d96fe1 100644 --- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter.pass.cpp +++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter.pass.cpp @@ -48,6 +48,27 @@ static_assert(!std::is_constructible, int*, bad_deleter> static_assert(!std::is_constructible, int(*)[5], test_deleter >::value, ""); #endif +int f() { return 5; } + +// https://cplusplus.github.io/LWG/issue3018 +// LWG 3018. shared_ptr of function type +struct function_pointer_deleter { + function_pointer_deleter(bool& deleter_called) : deleter_called_(deleter_called) {} + + void operator()(int (*)()) const { deleter_called_ = true; } + + bool& deleter_called_; +}; + +void test_function_type() { + bool deleter_called = false; + { + std::shared_ptr p(&f, function_pointer_deleter(deleter_called)); + assert((*p)() == 5); + } + assert(deleter_called); +} + int main(int, char**) { { @@ -94,5 +115,6 @@ int main(int, char**) } #endif // TEST_STD_VER >= 11 + test_function_type(); return 0; } From c54657887b2cd88f0745c151fec0b15a8a7d1e44 Mon Sep 17 00:00:00 2001 From: Mingming Liu Date: Wed, 29 May 2024 10:50:44 -0700 Subject: [PATCH 154/230] [nfc][InstrProfWriter]Store header fields in a vector and back patch once (#93594) This is a split of https://github.com/llvm/llvm-project/pull/93346 as discussed. 
--- llvm/lib/ProfileData/InstrProfWriter.cpp | 62 ++++++------------------ 1 file changed, 16 insertions(+), 46 deletions(-) diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index b16714ae8b9a2d..e732882337d468 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -893,52 +893,22 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { } InfoObj->CSSummaryBuilder = nullptr; - const size_t MemProfOffset = BackPatchStartOffset + sizeof(uint64_t); - const size_t BinaryIdOffset = MemProfOffset + sizeof(uint64_t); - const size_t TemporalProfTracesOffset = BinaryIdOffset + sizeof(uint64_t); - const size_t VTableNamesOffset = TemporalProfTracesOffset + sizeof(uint64_t); - if (!WritePrevVersion) { - // Now do the final patch: - PatchItem PatchItems[] = { - // Patch the Header.HashOffset field. - {BackPatchStartOffset, &HashTableStart, 1}, - // Patch the Header.MemProfOffset (=0 for profiles without MemProf - // data). - {MemProfOffset, &MemProfSectionStart, 1}, - // Patch the Header.BinaryIdSectionOffset. - {BinaryIdOffset, &BinaryIdSectionStart, 1}, - // Patch the Header.TemporalProfTracesOffset (=0 for profiles without - // traces). - {TemporalProfTracesOffset, &TemporalProfTracesSectionStart, 1}, - {VTableNamesOffset, &VTableNamesSectionStart, 1}, - // Patch the summary data. - {SummaryOffset, reinterpret_cast(TheSummary.get()), - (int)(SummarySize / sizeof(uint64_t))}, - {CSSummaryOffset, reinterpret_cast(TheCSSummary.get()), - (int)CSSummarySize}}; - - OS.patch(PatchItems); - } else { - // Now do the final patch: - PatchItem PatchItems[] = { - // Patch the Header.HashOffset field. - {BackPatchStartOffset, &HashTableStart, 1}, - // Patch the Header.MemProfOffset (=0 for profiles without MemProf - // data). - {MemProfOffset, &MemProfSectionStart, 1}, - // Patch the Header.BinaryIdSectionOffset. 
- {BinaryIdOffset, &BinaryIdSectionStart, 1}, - // Patch the Header.TemporalProfTracesOffset (=0 for profiles without - // traces). - {TemporalProfTracesOffset, &TemporalProfTracesSectionStart, 1}, - // Patch the summary data. - {SummaryOffset, reinterpret_cast(TheSummary.get()), - (int)(SummarySize / sizeof(uint64_t))}, - {CSSummaryOffset, reinterpret_cast(TheCSSummary.get()), - (int)CSSummarySize}}; - - OS.patch(PatchItems); - } + SmallVector HeaderOffsets = {HashTableStart, MemProfSectionStart, + BinaryIdSectionStart, + TemporalProfTracesSectionStart}; + if (!WritePrevVersion) + HeaderOffsets.push_back(VTableNamesSectionStart); + + PatchItem PatchItems[] = { + // Patch the Header fields + {BackPatchStartOffset, HeaderOffsets.data(), (int)HeaderOffsets.size()}, + // Patch the summary data. + {SummaryOffset, reinterpret_cast(TheSummary.get()), + (int)(SummarySize / sizeof(uint64_t))}, + {CSSummaryOffset, reinterpret_cast(TheCSSummary.get()), + (int)CSSummarySize}}; + + OS.patch(PatchItems); for (const auto &I : FunctionData) for (const auto &F : I.getValue()) From 1f67f34a5cf993f03eca8936bfb7203778c2997a Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Wed, 20 Mar 2024 17:25:47 -0700 Subject: [PATCH 155/230] [MTE] add stack frame history buffer this will allow us to find offending objects in a symbolization step, like we can do with hwasan. 
needs matching changes in AOSP: https://android-review.git.corp.google.com/q/topic:%22stackhistorybuffer%22 Pull Request: https://github.com/llvm/llvm-project/pull/86356 --- .../Target/AArch64/AArch64FrameLowering.cpp | 3 +- .../Target/AArch64/AArch64StackTagging.cpp | 64 ++++++++++++++++- .../CodeGen/AArch64/stack-tagging-prologue.ll | 69 +++++++++++++++++++ 3 files changed, 134 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index dc7759367687b7..cd532671f50189 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -2500,7 +2500,8 @@ AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, return resolveFrameIndexReference( MF, FI, FrameReg, /*PreferFP=*/ - MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress), + MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress) || + MF.getFunction().hasFnAttribute(Attribute::SanitizeMemTag), /*ForSimm=*/false); } diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp index aabc5d5d22e2d3..eab3a90e57e209 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp @@ -11,6 +11,7 @@ #include "AArch64InstrInfo.h" #include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" +#include "llvm/ADT/APInt.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -21,6 +22,7 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/StackSafetyAnalysis.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/CodeGen/LiveRegUnits.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" @@ -82,6 +84,26 @@ static cl::opt 
ClMaxLifetimes( cl::desc("How many lifetime ends to handle for a single alloca."), cl::Optional); +// Mode for selecting how to insert frame record info into the stack ring +// buffer. +enum RecordStackHistoryMode { + // Do not record frame record info. + none, + + // Insert instructions into the prologue for storing into the stack ring + // buffer directly. + instr, +}; + +static cl::opt ClRecordStackHistory( + "stack-tagging-record-stack-history", + cl::desc("Record stack frames with tagged allocations in a thread-local " + "ring buffer"), + cl::values(clEnumVal(none, "Do not record stack ring history"), + clEnumVal(instr, "Insert instructions into the prologue for " + "storing into the stack ring buffer")), + cl::Hidden, cl::init(none)); + static const Align kTagGranuleSize = Align(16); namespace { @@ -309,6 +331,7 @@ class AArch64StackTagging : public FunctionPass { uint64_t Size, InitializerBuilder &IB); Instruction *insertBaseTaggedPointer( + const Module &M, const MapVector &Allocas, const DominatorTree *DT); bool runOnFunction(Function &F) override; @@ -437,6 +460,7 @@ void AArch64StackTagging::untagAlloca(AllocaInst *AI, Instruction *InsertBefore, } Instruction *AArch64StackTagging::insertBaseTaggedPointer( + const Module &M, const MapVector &AllocasToInstrument, const DominatorTree *DT) { BasicBlock *PrologueBB = nullptr; @@ -458,6 +482,41 @@ Instruction *AArch64StackTagging::insertBaseTaggedPointer( Instruction *Base = IRB.CreateCall(IRG_SP, {Constant::getNullValue(IRB.getInt64Ty())}); Base->setName("basetag"); + auto TargetTriple = Triple(M.getTargetTriple()); + // This is not a stable ABI for now, so only allow in dev builds with API + // level 10000. + // The ThreadLong format is the same as with HWASan, but the entries for + // stack MTE take two slots (16 bytes). 
+ if (ClRecordStackHistory == instr && TargetTriple.isAndroid() && + TargetTriple.isAArch64() && !TargetTriple.isAndroidVersionLT(10000) && + !AllocasToInstrument.empty()) { + constexpr int StackMteSlot = -3; + constexpr uint64_t TagMask = 0xFULL << 56; + + auto *IntptrTy = IRB.getIntPtrTy(M.getDataLayout()); + Value *SlotPtr = memtag::getAndroidSlotPtr(IRB, StackMteSlot); + auto *ThreadLong = IRB.CreateLoad(IntptrTy, SlotPtr); + Value *TaggedFP = IRB.CreateOr( + memtag::getFP(IRB), + IRB.CreateAnd(IRB.CreatePtrToInt(Base, IntptrTy), TagMask)); + Value *PC = memtag::getPC(TargetTriple, IRB); + Value *RecordPtr = IRB.CreateIntToPtr(ThreadLong, IRB.getPtrTy(0)); + IRB.CreateStore(PC, RecordPtr); + IRB.CreateStore(TaggedFP, IRB.CreateConstGEP1_64(IntptrTy, RecordPtr, 1)); + // Update the ring buffer. Top byte of ThreadLong defines the size of the + // buffer in pages, it must be a power of two, and the start of the buffer + // must be aligned by twice that much. Therefore wrap around of the ring + // buffer is simply Addr &= ~((ThreadLong >> 56) << 12). + // The use of AShr instead of LShr is due to + // https://bugs.llvm.org/show_bug.cgi?id=39030 + // Runtime library makes sure not to use the highest bit. 
+ Value *WrapMask = IRB.CreateXor( + IRB.CreateShl(IRB.CreateAShr(ThreadLong, 56), 12, "", true, true), + ConstantInt::get(IntptrTy, (uint64_t)-1)); + Value *ThreadLongNew = IRB.CreateAnd( + IRB.CreateAdd(ThreadLong, ConstantInt::get(IntptrTy, 16)), WrapMask); + IRB.CreateStore(ThreadLongNew, SlotPtr); + } return Base; } @@ -513,7 +572,8 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { SetTagFunc = Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_settag); - Instruction *Base = insertBaseTaggedPointer(SInfo.AllocasToInstrument, DT); + Instruction *Base = + insertBaseTaggedPointer(*Fn.getParent(), SInfo.AllocasToInstrument, DT); int NextTag = 0; for (auto &I : SInfo.AllocasToInstrument) { @@ -575,6 +635,8 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { for (auto *II : Info.LifetimeEnd) II->eraseFromParent(); } + + memtag::annotateDebugRecords(Info, static_cast(Tag)); } // If we have instrumented at least one alloca, all unrecognized lifetime diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll b/llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll new file mode 100644 index 00000000000000..3f55f3cc9a2e2e --- /dev/null +++ b/llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll @@ -0,0 +1,69 @@ +; RUN: opt < %s -aarch64-stack-tagging -stack-tagging-use-stack-safety=0 -S -o - | FileCheck %s --check-prefixes=CHECK +; RUN: opt < %s -aarch64-stack-tagging -stack-tagging-use-stack-safety=0 -S -stack-tagging-record-stack-history=instr -o - | FileCheck %s --check-prefixes=INSTR +; RUN llc -mattr=+mte -stack-tagging-use-stack-safety=0 -stack-tagging-record-stack-history=instr %s -o - | FileCheck %s --check-prefixes=ASMINSTR + + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-android10000" + +declare void @use8(ptr) +declare void @use32(ptr) +declare void @llvm.lifetime.start.p0(i64, ptr nocapture) +declare void @llvm.lifetime.end.p0(i64, ptr nocapture) + +define dso_local 
void @noUse32(ptr) sanitize_memtag { +entry: + ret void +} + +define void @OneVar() sanitize_memtag { +entry: + %x = alloca i32, align 4 + call void @use32(ptr %x) + ret void +} + +; CHECK-LABEL: define void @OneVar( +; CHECK: [[BASE:%.*]] = call ptr @llvm.aarch64.irg.sp(i64 0) +; CHECK: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 +; CHECK: [[TX:%.*]] = call ptr @llvm.aarch64.tagp.{{.*}}(ptr [[X]], ptr [[BASE]], i64 0) +; CHECK: ret void + +; INSTR-LABEL: define void @OneVar( +; INSTR: [[BASE:%.*]] = call ptr @llvm.aarch64.irg.sp(i64 0) +; INSTR: [[TLS:%.*]] = call ptr @llvm.thread.pointer() +; INSTR: [[TLS_SLOT:%.*]] = getelementptr i8, ptr [[TLS]], i32 -24 +; INSTR: [[TLS_VALUE:%.*]] = load i64, ptr %1, align 8 +; INSTR: [[FP:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) +; INSTR: [[FP_INT:%.*]] = ptrtoint ptr %3 to i64 +; INSTR: [[BASE_INT:%.*]] = ptrtoint ptr %basetag to i64 +; INSTR: [[BASE_TAG:%.*]] = and i64 [[BASE_INT]], 1080863910568919040 +; INSTR: [[TAGGED_FP:%.*]] = or i64 [[FP_INT]], [[BASE_TAG]] +; INSTR: [[PC:%.*]] = call i64 @llvm.read_register.i64(metadata !0) +; INSTR: [[TLS_VALUE_PTR:%.*]] = inttoptr i64 [[TLS_VALUE]] to ptr +; INSTR: store i64 [[PC]], ptr [[TLS_VALUE_PTR]], align 8 +; INSTR: [[SECOND_SLOT:%.*]] = getelementptr i64, ptr [[TLS_VALUE_PTR]], i64 1 +; INSTR: store i64 [[TAGGED_FP]], ptr [[SECOND_SLOT]], align 8 +; INSTR: [[SIZE_IN_PAGES:%.*]] = ashr i64 [[TLS_VALUE]], 56 +; INSTR: [[WRAP_MASK_INTERMEDIARY:%.*]] = shl nuw nsw i64 [[SIZE_IN_PAGES]], 12 +; INSTR: [[WRAP_MASK:%.*]] = xor i64 [[WRAP_MASK_INTERMEDIARY]], -1 +; INSTR: [[NEXT_TLS_VALUE_BEFORE_WRAP:%.*]] = add i64 [[TLS_VALUE]], 16 +; INSTR: [[NEXT_TLS_VALUE:%.*]] = and i64 [[NEXT_TLS_VALUE_BEFORE_WRAP]], [[WRAP_MASK]] +; INSTR: store i64 [[NEXT_TLS_VALUE]], ptr [[TLS_SLOT]], align 8 +; INSTR: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 +; INSTR: [[TX:%.*]] = call ptr @llvm.aarch64.tagp.{{.*}}(ptr [[X]], ptr [[BASE]], i64 0) +; INSTR: [[PC:!.*]] = !{!"pc"} + +; 
ASMINSTR-LABEL: OneVar: +; ASMINSTR: mrs [[TLS:x.*]], TPIDR_EL0 +; ASMINSTR: irg [[BASE:x.*]], sp +; ASMINSTR: adr [[PC:x.*]], #0 +; ASMINSTR: ldur [[TLS_SLOT:x.*]], [[[TLS]], #-24] +; ASMINSTR: and [[SP_TAG:x.*]], [[BASE]], #0xf00000000000000 +; ASMINSTR: orr [[TAGGED_FP]], x29, [[SP_TAG]] +; ASMINSTR: asr [[TLS_SIZE:x.*]], [[TLS_SLOT]], #56 +; ASMINSTR: add [[NEXT_TLS_VALUE_BEFORE_WRAP:x.*]], [[TLS_SLOT]], #16 +; ASMINSTR: stp [[PC]], [[TAGGED_FP]], [[[TLS_SLOT]]] +; ASMINSTR: bic [[NEXT_TLS_VALUE:x.*]], [[NEXT_TLS_VALUE_BEFORE_WRAP]], [[TLS_SIZE]], lsl #12 +; ASMINSTR: stur [[NEXT_TLS_VALUE]], [[[TLS]], #-24] +; ASMINSTR: stg [[BASE]], [[[BASE]]] From 3313f28897a87ec313ec0b52ef71c14d3b9ff652 Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Wed, 29 May 2024 11:21:29 -0700 Subject: [PATCH 156/230] Revert "[MTE] add stack frame history buffer" This reverts commit 1f67f34a5cf993f03eca8936bfb7203778c2997a. --- .../Target/AArch64/AArch64FrameLowering.cpp | 3 +- .../Target/AArch64/AArch64StackTagging.cpp | 64 +---------------- .../CodeGen/AArch64/stack-tagging-prologue.ll | 69 ------------------- 3 files changed, 2 insertions(+), 134 deletions(-) delete mode 100644 llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index cd532671f50189..dc7759367687b7 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -2500,8 +2500,7 @@ AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, return resolveFrameIndexReference( MF, FI, FrameReg, /*PreferFP=*/ - MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress) || - MF.getFunction().hasFnAttribute(Attribute::SanitizeMemTag), + MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress), /*ForSimm=*/false); } diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp 
index eab3a90e57e209..aabc5d5d22e2d3 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp @@ -11,7 +11,6 @@ #include "AArch64InstrInfo.h" #include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" -#include "llvm/ADT/APInt.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -22,7 +21,6 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/StackSafetyAnalysis.h" -#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/CodeGen/LiveRegUnits.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" @@ -84,26 +82,6 @@ static cl::opt ClMaxLifetimes( cl::desc("How many lifetime ends to handle for a single alloca."), cl::Optional); -// Mode for selecting how to insert frame record info into the stack ring -// buffer. -enum RecordStackHistoryMode { - // Do not record frame record info. - none, - - // Insert instructions into the prologue for storing into the stack ring - // buffer directly. 
- instr, -}; - -static cl::opt ClRecordStackHistory( - "stack-tagging-record-stack-history", - cl::desc("Record stack frames with tagged allocations in a thread-local " - "ring buffer"), - cl::values(clEnumVal(none, "Do not record stack ring history"), - clEnumVal(instr, "Insert instructions into the prologue for " - "storing into the stack ring buffer")), - cl::Hidden, cl::init(none)); - static const Align kTagGranuleSize = Align(16); namespace { @@ -331,7 +309,6 @@ class AArch64StackTagging : public FunctionPass { uint64_t Size, InitializerBuilder &IB); Instruction *insertBaseTaggedPointer( - const Module &M, const MapVector &Allocas, const DominatorTree *DT); bool runOnFunction(Function &F) override; @@ -460,7 +437,6 @@ void AArch64StackTagging::untagAlloca(AllocaInst *AI, Instruction *InsertBefore, } Instruction *AArch64StackTagging::insertBaseTaggedPointer( - const Module &M, const MapVector &AllocasToInstrument, const DominatorTree *DT) { BasicBlock *PrologueBB = nullptr; @@ -482,41 +458,6 @@ Instruction *AArch64StackTagging::insertBaseTaggedPointer( Instruction *Base = IRB.CreateCall(IRG_SP, {Constant::getNullValue(IRB.getInt64Ty())}); Base->setName("basetag"); - auto TargetTriple = Triple(M.getTargetTriple()); - // This is not a stable ABI for now, so only allow in dev builds with API - // level 10000. - // The ThreadLong format is the same as with HWASan, but the entries for - // stack MTE take two slots (16 bytes). 
- if (ClRecordStackHistory == instr && TargetTriple.isAndroid() && - TargetTriple.isAArch64() && !TargetTriple.isAndroidVersionLT(10000) && - !AllocasToInstrument.empty()) { - constexpr int StackMteSlot = -3; - constexpr uint64_t TagMask = 0xFULL << 56; - - auto *IntptrTy = IRB.getIntPtrTy(M.getDataLayout()); - Value *SlotPtr = memtag::getAndroidSlotPtr(IRB, StackMteSlot); - auto *ThreadLong = IRB.CreateLoad(IntptrTy, SlotPtr); - Value *TaggedFP = IRB.CreateOr( - memtag::getFP(IRB), - IRB.CreateAnd(IRB.CreatePtrToInt(Base, IntptrTy), TagMask)); - Value *PC = memtag::getPC(TargetTriple, IRB); - Value *RecordPtr = IRB.CreateIntToPtr(ThreadLong, IRB.getPtrTy(0)); - IRB.CreateStore(PC, RecordPtr); - IRB.CreateStore(TaggedFP, IRB.CreateConstGEP1_64(IntptrTy, RecordPtr, 1)); - // Update the ring buffer. Top byte of ThreadLong defines the size of the - // buffer in pages, it must be a power of two, and the start of the buffer - // must be aligned by twice that much. Therefore wrap around of the ring - // buffer is simply Addr &= ~((ThreadLong >> 56) << 12). - // The use of AShr instead of LShr is due to - // https://bugs.llvm.org/show_bug.cgi?id=39030 - // Runtime library makes sure not to use the highest bit. 
- Value *WrapMask = IRB.CreateXor( - IRB.CreateShl(IRB.CreateAShr(ThreadLong, 56), 12, "", true, true), - ConstantInt::get(IntptrTy, (uint64_t)-1)); - Value *ThreadLongNew = IRB.CreateAnd( - IRB.CreateAdd(ThreadLong, ConstantInt::get(IntptrTy, 16)), WrapMask); - IRB.CreateStore(ThreadLongNew, SlotPtr); - } return Base; } @@ -572,8 +513,7 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { SetTagFunc = Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_settag); - Instruction *Base = - insertBaseTaggedPointer(*Fn.getParent(), SInfo.AllocasToInstrument, DT); + Instruction *Base = insertBaseTaggedPointer(SInfo.AllocasToInstrument, DT); int NextTag = 0; for (auto &I : SInfo.AllocasToInstrument) { @@ -635,8 +575,6 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { for (auto *II : Info.LifetimeEnd) II->eraseFromParent(); } - - memtag::annotateDebugRecords(Info, static_cast(Tag)); } // If we have instrumented at least one alloca, all unrecognized lifetime diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll b/llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll deleted file mode 100644 index 3f55f3cc9a2e2e..00000000000000 --- a/llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll +++ /dev/null @@ -1,69 +0,0 @@ -; RUN: opt < %s -aarch64-stack-tagging -stack-tagging-use-stack-safety=0 -S -o - | FileCheck %s --check-prefixes=CHECK -; RUN: opt < %s -aarch64-stack-tagging -stack-tagging-use-stack-safety=0 -S -stack-tagging-record-stack-history=instr -o - | FileCheck %s --check-prefixes=INSTR -; RUN llc -mattr=+mte -stack-tagging-use-stack-safety=0 -stack-tagging-record-stack-history=instr %s -o - | FileCheck %s --check-prefixes=ASMINSTR - - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64--linux-android10000" - -declare void @use8(ptr) -declare void @use32(ptr) -declare void @llvm.lifetime.start.p0(i64, ptr nocapture) -declare void @llvm.lifetime.end.p0(i64, ptr nocapture) - -define dso_local 
void @noUse32(ptr) sanitize_memtag { -entry: - ret void -} - -define void @OneVar() sanitize_memtag { -entry: - %x = alloca i32, align 4 - call void @use32(ptr %x) - ret void -} - -; CHECK-LABEL: define void @OneVar( -; CHECK: [[BASE:%.*]] = call ptr @llvm.aarch64.irg.sp(i64 0) -; CHECK: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; CHECK: [[TX:%.*]] = call ptr @llvm.aarch64.tagp.{{.*}}(ptr [[X]], ptr [[BASE]], i64 0) -; CHECK: ret void - -; INSTR-LABEL: define void @OneVar( -; INSTR: [[BASE:%.*]] = call ptr @llvm.aarch64.irg.sp(i64 0) -; INSTR: [[TLS:%.*]] = call ptr @llvm.thread.pointer() -; INSTR: [[TLS_SLOT:%.*]] = getelementptr i8, ptr [[TLS]], i32 -24 -; INSTR: [[TLS_VALUE:%.*]] = load i64, ptr %1, align 8 -; INSTR: [[FP:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) -; INSTR: [[FP_INT:%.*]] = ptrtoint ptr %3 to i64 -; INSTR: [[BASE_INT:%.*]] = ptrtoint ptr %basetag to i64 -; INSTR: [[BASE_TAG:%.*]] = and i64 [[BASE_INT]], 1080863910568919040 -; INSTR: [[TAGGED_FP:%.*]] = or i64 [[FP_INT]], [[BASE_TAG]] -; INSTR: [[PC:%.*]] = call i64 @llvm.read_register.i64(metadata !0) -; INSTR: [[TLS_VALUE_PTR:%.*]] = inttoptr i64 [[TLS_VALUE]] to ptr -; INSTR: store i64 [[PC]], ptr [[TLS_VALUE_PTR]], align 8 -; INSTR: [[SECOND_SLOT:%.*]] = getelementptr i64, ptr [[TLS_VALUE_PTR]], i64 1 -; INSTR: store i64 [[TAGGED_FP]], ptr [[SECOND_SLOT]], align 8 -; INSTR: [[SIZE_IN_PAGES:%.*]] = ashr i64 [[TLS_VALUE]], 56 -; INSTR: [[WRAP_MASK_INTERMEDIARY:%.*]] = shl nuw nsw i64 [[SIZE_IN_PAGES]], 12 -; INSTR: [[WRAP_MASK:%.*]] = xor i64 [[WRAP_MASK_INTERMEDIARY]], -1 -; INSTR: [[NEXT_TLS_VALUE_BEFORE_WRAP:%.*]] = add i64 [[TLS_VALUE]], 16 -; INSTR: [[NEXT_TLS_VALUE:%.*]] = and i64 [[NEXT_TLS_VALUE_BEFORE_WRAP]], [[WRAP_MASK]] -; INSTR: store i64 [[NEXT_TLS_VALUE]], ptr [[TLS_SLOT]], align 8 -; INSTR: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; INSTR: [[TX:%.*]] = call ptr @llvm.aarch64.tagp.{{.*}}(ptr [[X]], ptr [[BASE]], i64 0) -; INSTR: [[PC:!.*]] = !{!"pc"} - -; 
ASMINSTR-LABEL: OneVar: -; ASMINSTR: mrs [[TLS:x.*]], TPIDR_EL0 -; ASMINSTR: irg [[BASE:x.*]], sp -; ASMINSTR: adr [[PC:x.*]], #0 -; ASMINSTR: ldur [[TLS_SLOT:x.*]], [[[TLS]], #-24] -; ASMINSTR: and [[SP_TAG:x.*]], [[BASE]], #0xf00000000000000 -; ASMINSTR: orr [[TAGGED_FP]], x29, [[SP_TAG]] -; ASMINSTR: asr [[TLS_SIZE:x.*]], [[TLS_SLOT]], #56 -; ASMINSTR: add [[NEXT_TLS_VALUE_BEFORE_WRAP:x.*]], [[TLS_SLOT]], #16 -; ASMINSTR: stp [[PC]], [[TAGGED_FP]], [[[TLS_SLOT]]] -; ASMINSTR: bic [[NEXT_TLS_VALUE:x.*]], [[NEXT_TLS_VALUE_BEFORE_WRAP]], [[TLS_SIZE]], lsl #12 -; ASMINSTR: stur [[NEXT_TLS_VALUE]], [[[TLS]], #-24] -; ASMINSTR: stg [[BASE]], [[[BASE]]] From 1a2f3309765fdc143fdc3809211fb85d2e2ca341 Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Wed, 29 May 2024 15:23:44 -0300 Subject: [PATCH 157/230] [clang] Improve ast-dumper text printing of TemplateArgument (#93431) This improves and unifies our approach to printing all template arguments. The same approach to printing types is extended to all TemplateArguments: A sugared version is printed in quotes, followed by printing the canonical form, unless they would print the same. Special improvements are done to add more detail to template template arguments. It's planned in a future patch to use this improved TemplateName printer for other places besides TemplateArguments. Note: The sugared/desugared printing does not show up for TemplateNames in tests yet, because we do a poor job of preserving their type sugar. This will be improved in a future patch. 
--- clang/docs/ReleaseNotes.rst | 2 + clang/include/clang/AST/TextNodeDumper.h | 2 + clang/lib/AST/TextNodeDumper.cpp | 104 +++++++++++++++--- clang/test/AST/ast-dump-decl.cpp | 25 +++-- ...penmp-begin-declare-variant_template_2.cpp | 6 +- clang/test/AST/ast-dump-template-name.cpp | 54 +++++++++ clang/test/AST/ast-dump-using-template.cpp | 8 +- .../constraints-explicit-instantiation.cpp | 6 +- clang/test/OpenMP/align_clause_ast_print.cpp | 2 +- clang/test/OpenMP/generic_loop_ast_print.cpp | 2 +- clang/test/OpenMP/interop_ast_print.cpp | 2 +- clang/test/SemaOpenACC/sub-array-ast.cpp | 2 +- .../aggregate-deduction-candidate.cpp | 18 +-- clang/test/SemaTemplate/attributes.cpp | 64 +++++------ clang/test/SemaTemplate/deduction-guide.cpp | 19 ++-- clang/test/SemaTemplate/make_integer_seq.cpp | 68 +++++++----- clang/test/SemaTemplate/type_pack_element.cpp | 20 ++-- 17 files changed, 276 insertions(+), 128 deletions(-) create mode 100644 clang/test/AST/ast-dump-template-name.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index bd92818f0c09d0..e1c6d55eeeacdf 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -98,6 +98,8 @@ ABI Changes in This Version AST Dumping Potentially Breaking Changes ---------------------------------------- +- The text ast-dumper has improved printing of TemplateArguments. 
+ Clang Frontend Potentially Breaking Changes ------------------------------------------- - Removed support for constructing on-stack ``TemplateArgumentList``\ s; interfaces should instead diff --git a/clang/include/clang/AST/TextNodeDumper.h b/clang/include/clang/AST/TextNodeDumper.h index 1fede6e462e925..63fa16c9ec47c1 100644 --- a/clang/include/clang/AST/TextNodeDumper.h +++ b/clang/include/clang/AST/TextNodeDumper.h @@ -213,6 +213,8 @@ class TextNodeDumper void dumpTemplateSpecializationKind(TemplateSpecializationKind TSK); void dumpNestedNameSpecifier(const NestedNameSpecifier *NNS); void dumpConceptReference(const ConceptReference *R); + void dumpTemplateArgument(const TemplateArgument &TA); + void dumpTemplateName(TemplateName TN); void dumpDeclRef(const Decl *D, StringRef Label = {}); diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index 4a1e94ffe283ba..627f8d3477d4e6 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -947,6 +947,26 @@ void TextNodeDumper::dumpDeclRef(const Decl *D, StringRef Label) { }); } +void TextNodeDumper::dumpTemplateArgument(const TemplateArgument &TA) { + llvm::SmallString<128> Str; + { + llvm::raw_svector_ostream SS(Str); + TA.print(PrintPolicy, SS, /*IncludeType=*/true); + } + OS << " '" << Str << "'"; + + if (TemplateArgument CanonTA = Context->getCanonicalTemplateArgument(TA); + !CanonTA.structurallyEquals(TA)) { + llvm::SmallString<128> CanonStr; + { + llvm::raw_svector_ostream SS(CanonStr); + CanonTA.print(PrintPolicy, SS, /*IncludeType=*/true); + } + if (CanonStr != Str) + OS << ":'" << CanonStr << "'"; + } +} + const char *TextNodeDumper::getCommandName(unsigned CommandID) { if (Traits) return Traits->getCommandInfo(CommandID)->Name; @@ -1086,45 +1106,101 @@ void TextNodeDumper::VisitNullTemplateArgument(const TemplateArgument &) { void TextNodeDumper::VisitTypeTemplateArgument(const TemplateArgument &TA) { OS << " type"; - dumpType(TA.getAsType()); + 
dumpTemplateArgument(TA); } void TextNodeDumper::VisitDeclarationTemplateArgument( const TemplateArgument &TA) { OS << " decl"; + dumpTemplateArgument(TA); dumpDeclRef(TA.getAsDecl()); } -void TextNodeDumper::VisitNullPtrTemplateArgument(const TemplateArgument &) { +void TextNodeDumper::VisitNullPtrTemplateArgument(const TemplateArgument &TA) { OS << " nullptr"; + dumpTemplateArgument(TA); } void TextNodeDumper::VisitIntegralTemplateArgument(const TemplateArgument &TA) { - OS << " integral " << TA.getAsIntegral(); + OS << " integral"; + dumpTemplateArgument(TA); +} + +void TextNodeDumper::dumpTemplateName(TemplateName TN) { + switch (TN.getKind()) { + case TemplateName::Template: + AddChild([=] { Visit(TN.getAsTemplateDecl()); }); + return; + case TemplateName::UsingTemplate: { + const UsingShadowDecl *USD = TN.getAsUsingShadowDecl(); + AddChild([=] { Visit(USD); }); + AddChild("target", [=] { Visit(USD->getTargetDecl()); }); + return; + } + case TemplateName::QualifiedTemplate: { + OS << " qualified"; + const QualifiedTemplateName *QTN = TN.getAsQualifiedTemplateName(); + if (QTN->hasTemplateKeyword()) + OS << " keyword"; + dumpNestedNameSpecifier(QTN->getQualifier()); + dumpTemplateName(QTN->getUnderlyingTemplate()); + return; + } + case TemplateName::DependentTemplate: { + OS << " dependent"; + const DependentTemplateName *DTN = TN.getAsDependentTemplateName(); + dumpNestedNameSpecifier(DTN->getQualifier()); + return; + } + case TemplateName::SubstTemplateTemplateParm: { + OS << " subst"; + const SubstTemplateTemplateParmStorage *STS = + TN.getAsSubstTemplateTemplateParm(); + OS << " index " << STS->getIndex(); + if (std::optional PackIndex = STS->getPackIndex()) + OS << " pack_index " << *PackIndex; + if (const TemplateTemplateParmDecl *P = STS->getParameter()) + AddChild("parameter", [=] { Visit(P); }); + dumpDeclRef(STS->getAssociatedDecl(), "associated"); + AddChild("replacement", [=] { dumpTemplateName(STS->getReplacement()); }); + return; + } + // FIXME: 
Implement these. + case TemplateName::OverloadedTemplate: + OS << " overloaded"; + return; + case TemplateName::AssumedTemplate: + OS << " assumed"; + return; + case TemplateName::SubstTemplateTemplateParmPack: + OS << " subst_pack"; + return; + } + llvm_unreachable("Unexpected TemplateName Kind"); } void TextNodeDumper::VisitTemplateTemplateArgument(const TemplateArgument &TA) { - if (TA.getAsTemplate().getKind() == TemplateName::UsingTemplate) - OS << " using"; - OS << " template "; - TA.getAsTemplate().dump(OS); + OS << " template"; + dumpTemplateArgument(TA); + dumpTemplateName(TA.getAsTemplate()); } void TextNodeDumper::VisitTemplateExpansionTemplateArgument( const TemplateArgument &TA) { - if (TA.getAsTemplateOrTemplatePattern().getKind() == - TemplateName::UsingTemplate) - OS << " using"; - OS << " template expansion "; - TA.getAsTemplateOrTemplatePattern().dump(OS); + OS << " template expansion"; + dumpTemplateArgument(TA); + dumpTemplateName(TA.getAsTemplateOrTemplatePattern()); } -void TextNodeDumper::VisitExpressionTemplateArgument(const TemplateArgument &) { +void TextNodeDumper::VisitExpressionTemplateArgument( + const TemplateArgument &TA) { OS << " expr"; + dumpTemplateArgument(TA); } -void TextNodeDumper::VisitPackTemplateArgument(const TemplateArgument &) { +void TextNodeDumper::VisitPackTemplateArgument(const TemplateArgument &TA) { OS << " pack"; + dumpTemplateArgument(TA); } static void dumpBasePath(raw_ostream &OS, const CastExpr *Node) { diff --git a/clang/test/AST/ast-dump-decl.cpp b/clang/test/AST/ast-dump-decl.cpp index e062d4f068a403..b861ba8be15b50 100644 --- a/clang/test/AST/ast-dump-decl.cpp +++ b/clang/test/AST/ast-dump-decl.cpp @@ -459,21 +459,23 @@ namespace testClassTemplateDecl { // CHECK: ClassTemplateDecl 0x{{.+}} <{{.+}}:[[@LINE-148]]:3, col:31> col:31 TestTemplateDefaultNonType{{$}} // CHECK-NEXT: |-NonTypeTemplateParmDecl 0x{{.+}} col:16 'int' depth 0 index 0 I{{$}} -// CHECK-NEXT: | `-TemplateArgument expr{{$}} +// 
CHECK-NEXT: | `-TemplateArgument expr '42'{{$}} // CHECK-NEXT: | `-IntegerLiteral 0x{{.+}} 'int' 42{{$}} // CHECK-NEXT: `-CXXRecordDecl 0x{{.+}} col:31 struct TestTemplateDefaultNonType{{$}} // CHECK: ClassTemplateDecl 0x{{.+}} <{{.+}}:{{.*}}:3, col:68> col:68 TestTemplateTemplateDefaultType{{$}} // CHECK-NEXT: |-TemplateTemplateParmDecl 0x{{.+}} col:37 depth 0 index 0 TT{{$}} // CHECK-NEXT: | |-TemplateTypeParmDecl 0x{{.+}} col:29 typename depth 1 index 0{{$}} -// CHECK-NEXT: | `-TemplateArgument template TestClassTemplate{{$}} -// CHECK-NEXT: `-CXXRecordDecl 0x{{.+}} col:68 struct TestTemplateTemplateDefaultType{{$}} +// CHECK-NEXT: | `-TemplateArgument template 'testClassTemplateDecl::TestClassTemplate'{{$}} +// CHECK-NEXT: | `-ClassTemplateDecl 0x{{.+}} line:{{.+}}:30 TestClassTemplate{{$}} +// CHECK-NEXT: `-CXXRecordDecl 0x{{.+}} col:68 struct TestTemplateTemplateDefaultType{{$}} // CHECK: ClassTemplateDecl 0x{{.+}} prev 0x{{.+}} <{{.+}}:{{.*}}:3, col:82> col:48 TestTemplateTemplateDefaultType{{$}} // CHECK-NEXT: |-TemplateTemplateParmDecl 0x{{.+}} col:37 depth 0 index 0 TT{{$}} // CHECK-NEXT: | |-TemplateTypeParmDecl 0x{{.+}} col:29 typename depth 1 index 0{{$}} -// CHECK-NEXT: | `-TemplateArgument template TestClassTemplate{{$}} -// CHECK-NEXT: | `-inherited from TemplateTemplateParm 0x{{.+}} 'TT'{{$}} +// CHECK-NEXT: | `-TemplateArgument template 'testClassTemplateDecl::TestClassTemplate'{{$}} +// CHECK-NEXT: | |-inherited from TemplateTemplateParm 0x{{.+}} 'TT'{{$}} +// CHECK-NEXT: | `-ClassTemplateDecl 0x{{.+}} line:{{.+}}:30 TestClassTemplate // CHECK-NEXT: `-CXXRecordDecl 0x{{.+}} prev 0x{{.+}} col:48 struct TestTemplateTemplateDefaultType definition{{$}} // CHECK-NEXT: |-DefinitionData empty aggregate standard_layout trivially_copyable pod trivial literal has_constexpr_non_copy_move_ctor can_const_default_init{{$}} // CHECK-NEXT: | |-DefaultConstructor exists trivial constexpr needs_implicit defaulted_is_constexpr{{$}} @@ -683,7 +685,8 @@ namespace 
TestTemplateTemplateParmDecl { // CHECK: FunctionTemplateDecl // CHECK-NEXT: TemplateTemplateParmDecl{{.*}} T // CHECK-NEXT: TemplateTypeParmDecl{{.*}} typename -// CHECK-NEXT: TemplateArgument{{.*}} template A +// CHECK-NEXT: TemplateArgument{{.*}} template 'TestTemplateTemplateParmDecl::A' +// CHECK-NEXT: ClassTemplateDecl {{.*}} A // CHECK-NEXT: TemplateTemplateParmDecl{{.*}} ... U // CHECK-NEXT: TemplateTypeParmDecl{{.*}} typename @@ -710,12 +713,12 @@ namespace TestTemplateArgument { template class testIntegral { }; template class testIntegral<1>; // CHECK: ClassTemplateSpecializationDecl{{.*}} class testIntegral - // CHECK: TemplateArgument{{.*}} integral 1 + // CHECK: TemplateArgument{{.*}} integral '1' template class> class testTemplate { }; template class testTemplate; // CHECK: ClassTemplateSpecializationDecl{{.*}} class testTemplate - // CHECK: TemplateArgument{{.*}} A + // CHECK: TemplateArgument{{.*}} 'TestTemplateArgument::A' template class ...T> class C { B testTemplateExpansion; @@ -731,10 +734,10 @@ namespace TestTemplateArgument { template class testPack { }; template class testPack<0, 1, 2>; // CHECK: ClassTemplateSpecializationDecl{{.*}} class testPack - // CHECK: TemplateArgument{{.*}} integral 0 + // CHECK: TemplateArgument{{.*}} integral '0' // CHECK-NEXT: TemplateArgument{{.*}} pack - // CHECK-NEXT: TemplateArgument{{.*}} integral 1 - // CHECK-NEXT: TemplateArgument{{.*}} integral 2 + // CHECK-NEXT: TemplateArgument{{.*}} integral '1' + // CHECK-NEXT: TemplateArgument{{.*}} integral '2' } namespace testUsingDecl { diff --git a/clang/test/AST/ast-dump-openmp-begin-declare-variant_template_2.cpp b/clang/test/AST/ast-dump-openmp-begin-declare-variant_template_2.cpp index da46cef7f3f1bc..6fe05e33a5fb87 100644 --- a/clang/test/AST/ast-dump-openmp-begin-declare-variant_template_2.cpp +++ b/clang/test/AST/ast-dump-openmp-begin-declare-variant_template_2.cpp @@ -79,7 +79,7 @@ int test() { // CHECK-NEXT: | | `-ReturnStmt [[ADDR_22:0x[a-z0-9]*]] // 
CHECK-NEXT: | | `-IntegerLiteral [[ADDR_23:0x[a-z0-9]*]] 'int' 0 // CHECK-NEXT: | `-FunctionDecl [[ADDR_24:0x[a-z0-9]*]] line:9:5 used also_before_mismatch 'int ({{.*}})' -// CHECK-NEXT: | |-TemplateArgument integral 0 +// CHECK-NEXT: | |-TemplateArgument integral '0' // CHECK-NEXT: | `-CompoundStmt [[ADDR_25:0x[a-z0-9]*]] // CHECK-NEXT: | `-ReturnStmt [[ADDR_26:0x[a-z0-9]*]] // CHECK-NEXT: | `-IntegerLiteral [[ADDR_23]] 'int' 0 @@ -179,7 +179,7 @@ int test() { // CHECK-NEXT: | | `-OMPDeclareVariantAttr [[ADDR_101:0x[a-z0-9]*]] <> Implicit implementation={extension(allow_templates)} // CHECK-NEXT: | | `-DeclRefExpr [[ADDR_102:0x[a-z0-9]*]] 'int ({{.*}})' {{.*}}Function [[ADDR_103:0x[a-z0-9]*]] 'only_def[implementation={extension(allow_templates)}]' 'int ({{.*}})' // CHECK-NEXT: | `-FunctionDecl [[ADDR_104:0x[a-z0-9]*]] col:5 used only_def 'int ({{.*}})' -// CHECK-NEXT: | |-TemplateArgument integral 0 +// CHECK-NEXT: | |-TemplateArgument integral '0' // CHECK-NEXT: | `-OMPDeclareVariantAttr [[ADDR_105:0x[a-z0-9]*]] <> Implicit implementation={extension(allow_templates)} // CHECK-NEXT: | `-DeclRefExpr [[ADDR_106:0x[a-z0-9]*]] 'int ({{.*}})' {{.*}}Function [[ADDR_107:0x[a-z0-9]*]] 'only_def[implementation={extension(allow_templates)}]' 'int ({{.*}})' // CHECK-NEXT: |-FunctionTemplateDecl [[ADDR_108:0x[a-z0-9]*]] line:38:1 only_def[implementation={extension(allow_templates)}] @@ -189,7 +189,7 @@ int test() { // CHECK-NEXT: | | `-ReturnStmt [[ADDR_110:0x[a-z0-9]*]] // CHECK-NEXT: | | `-IntegerLiteral [[ADDR_111:0x[a-z0-9]*]] 'int' 0 // CHECK-NEXT: | `-FunctionDecl [[ADDR_107]] line:38:1 only_def[implementation={extension(allow_templates)}] 'int ({{.*}})' -// CHECK-NEXT: | |-TemplateArgument integral 0 +// CHECK-NEXT: | |-TemplateArgument integral '0' // CHECK-NEXT: | `-CompoundStmt [[ADDR_112:0x[a-z0-9]*]] // CHECK-NEXT: | `-ReturnStmt [[ADDR_113:0x[a-z0-9]*]] // CHECK-NEXT: | `-IntegerLiteral [[ADDR_111]] 'int' 0 diff --git a/clang/test/AST/ast-dump-template-name.cpp 
b/clang/test/AST/ast-dump-template-name.cpp new file mode 100644 index 00000000000000..39100711b60a13 --- /dev/null +++ b/clang/test/AST/ast-dump-template-name.cpp @@ -0,0 +1,54 @@ +// RUN: %clang_cc1 -std=c++26 -ast-dump -ast-dump-filter=Test %s | FileCheck %s + +template